[AMDGPU] Adding AMDGPU dialect wrapper for ROCDL transpose loads. (#145395)
* 1-to-1 mapping wrapper op.
* Direct lowering from AMDGPU wrapper to ROCDL intrinsics.
This commit is contained in:
56
mlir/test/Conversion/AMDGPUToROCDL/transpose_load.mlir
Normal file
56
mlir/test/Conversion/AMDGPUToROCDL/transpose_load.mlir
Normal file
@@ -0,0 +1,56 @@
|
||||
// RUN: mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx950 | FileCheck %s
|
||||
// RUN: not mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx945 2>&1 | FileCheck %s --check-prefix=CHECK-OLD
|
||||
|
||||
// 4 x f16 transpose load from an address-space-3 memref. The gfx950 run must
// produce rocdl.ds.read.tr16.b64; the gfx945 run must reject the op with the
// "Non-gfx950 chipset not supported" diagnostic.
// CHECK-LABEL: func @transpose_load_to_rocdl_4xf16
func.func @transpose_load_to_rocdl_4xf16(%i : index, %j : index, %src : memref<128x72xf16, 3>) -> vector<4xf16> {
  // CHECK: rocdl.ds.read.tr16.b64
  // CHECK-OLD: error: 'amdgpu.transpose_load' op Non-gfx950 chipset not supported
  %loaded = amdgpu.transpose_load %src[%i, %j] : memref<128x72xf16, 3> -> vector<4xf16>
  func.return %loaded : vector<4xf16>
}
|
||||
|
||||
// -----
|
||||
|
||||
// 8 x i8 transpose load. On gfx950 the ROCDL read yields vector<2xi32>, which
// must be bitcast on the very next line to the requested vector<8xi8>. The
// gfx945 run must fail with the unsupported-chipset diagnostic.
// CHECK-LABEL: func @transpose_load_to_rocdl_8xi8
func.func @transpose_load_to_rocdl_8xi8(%i : index, %j : index, %src : memref<128x128xi8, 3>) -> vector<8xi8> {
  // CHECK: %[[RES:.*]] = rocdl.ds.read.tr8.b64
  // CHECK-SAME: -> vector<2xi32>
  // CHECK-NEXT: llvm.bitcast %[[RES]] : vector<2xi32> to vector<8xi8>
  // CHECK-OLD: error: 'amdgpu.transpose_load' op Non-gfx950 chipset not supported
  %loaded = amdgpu.transpose_load %src[%i, %j] : memref<128x128xi8, 3> -> vector<8xi8>
  func.return %loaded : vector<8xi8>
}
|
||||
|
||||
// -----
|
||||
|
||||
// 16 x i4 result loaded from an i8-typed memref. On gfx950 this must lower to
// rocdl.ds.read.tr4.b64 producing vector<2xi32>, immediately bitcast to
// vector<16xi4>. The gfx945 run must fail with the unsupported-chipset
// diagnostic.
// CHECK-LABEL: func @transpose_load_to_rocdl_i4_memrefxi8
func.func @transpose_load_to_rocdl_i4_memrefxi8(%i : index, %j : index, %src : memref<128x32xi8, 3>) -> vector<16xi4> {
  // CHECK: %[[RES:.*]] = rocdl.ds.read.tr4.b64
  // CHECK-SAME: -> vector<2xi32>
  // CHECK-NEXT: llvm.bitcast %[[RES]] : vector<2xi32> to vector<16xi4>
  // CHECK-OLD: error: 'amdgpu.transpose_load' op Non-gfx950 chipset not supported
  %loaded = amdgpu.transpose_load %src[%i, %j] : memref<128x32xi8, 3> -> vector<16xi4>
  func.return %loaded : vector<16xi4>
}
|
||||
|
||||
// -----
|
||||
|
||||
// 16 x i6 result loaded from an i8-typed memref. On gfx950 this must lower to
// the 96-bit variant rocdl.ds.read.tr6.b96 producing vector<3xi32>,
// immediately bitcast to vector<16xi6>. The gfx945 run must fail with the
// unsupported-chipset diagnostic.
// CHECK-LABEL: func @transpose_load_to_rocdl_i6_memrefxi8
func.func @transpose_load_to_rocdl_i6_memrefxi8(%i : index, %j : index, %src : memref<128x32xi8, 3>) -> vector<16xi6> {
  // CHECK: %[[RES:.*]] = rocdl.ds.read.tr6.b96
  // CHECK-SAME: -> vector<3xi32>
  // CHECK-NEXT: llvm.bitcast %[[RES]] : vector<3xi32> to vector<16xi6>
  // CHECK-OLD: error: 'amdgpu.transpose_load' op Non-gfx950 chipset not supported
  %loaded = amdgpu.transpose_load %src[%i, %j] : memref<128x32xi8, 3> -> vector<16xi6>
  func.return %loaded : vector<16xi6>
}
|
||||
|
||||
// -----
|
||||
|
||||
// 4 x i16 result loaded from an i8-typed memref. The gfx950 run must produce
// rocdl.ds.read.tr16.b64; the gfx945 run must fail with the
// unsupported-chipset diagnostic.
// CHECK-LABEL: func @transpose_load_to_rocdl_i16_memrefxi8
func.func @transpose_load_to_rocdl_i16_memrefxi8(%i : index, %j : index, %src : memref<128x32xi8, 3>) -> vector<4xi16> {
  // CHECK: rocdl.ds.read.tr16.b64
  // CHECK-OLD: error: 'amdgpu.transpose_load' op Non-gfx950 chipset not supported
  %loaded = amdgpu.transpose_load %src[%i, %j] : memref<128x32xi8, 3> -> vector<4xi16>
  func.return %loaded : vector<4xi16>
}
|
||||
@@ -0,0 +1,17 @@
|
||||
// RUN: not mlir-opt %s --split-input-file -convert-amdgpu-to-rocdl=chipset=gfx950 2>&1 | FileCheck %s
|
||||
|
||||
// -----
|
||||
|
||||
// Negative conversion case: the source memref has i4 elements, so the
// gfx950 lowering is expected to fail and report the 4-bit element size.
func.func @transpose_load_to_rocdl_16xi4(%i : index, %j : index, %src : memref<128x16xi4, 3>) -> vector<16xi4> {
  // CHECK: memref to have at least 8 bits element size, got 4
  %loaded = amdgpu.transpose_load %src[%i, %j] : memref<128x16xi4, 3> -> vector<16xi4>
  func.return %loaded : vector<16xi4>
}
|
||||
|
||||
// -----
|
||||
|
||||
// Negative conversion case: the source memref has i6 elements, so the
// gfx950 lowering is expected to fail and report the 6-bit element size.
func.func @transpose_load_to_rocdl_16xi6(%i : index, %j : index, %src : memref<128x32xi6, 3>) -> vector<16xi6> {
  // CHECK: memref to have at least 8 bits element size, got 6
  %loaded = amdgpu.transpose_load %src[%i, %j] : memref<128x32xi6, 3> -> vector<16xi6>
  func.return %loaded : vector<16xi6>
}
|
||||
@@ -166,3 +166,59 @@ func.func @swizzle_scalable_vec(%arg0 : vector<[4]xf32>) -> vector<[4]xf32> {
|
||||
%0 = amdgpu.swizzle_bitmode %arg0 1 2 4 : vector<[4]xf32>
|
||||
func.return %0 : vector<[4]xf32>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// The source memref lives in address space 1; the op must be rejected
// because it requires a Workgroup source. The diagnostic annotation has to
// stay directly above the op (it targets the next line).
func.func @transpose_load_addrspace(%i : index, %j : index, %src : memref<128x32xf16, 1>) -> vector<4xf16> {
  // expected-error@+1 {{'amdgpu.transpose_load' op source memory address space must be Workgroup}}
  %loaded = amdgpu.transpose_load %src[%i, %j] : memref<128x32xf16, 1> -> vector<4xf16>
  return %loaded : vector<4xf16>
}
|
||||
|
||||
// -----
|
||||
|
||||
// The source memref lives in address space 1; the op must be rejected
// because it requires a Workgroup source.
// NOTE(review): this case is identical to the preceding
// @transpose_load_addrspace case and adds no coverage - consider removing it
// or pointing it at a different non-Workgroup memory space.
func.func @transpose_load_addrspace(%i : index, %j : index, %src : memref<128x32xf16, 1>) -> vector<4xf16> {
  // expected-error@+1 {{'amdgpu.transpose_load' op source memory address space must be Workgroup}}
  %loaded = amdgpu.transpose_load %src[%i, %j] : memref<128x32xf16, 1> -> vector<4xf16>
  return %loaded : vector<4xf16>
}
|
||||
|
||||
// -----
|
||||
|
||||
// f32 (32-bit) elements are not a supported transpose-load element size; the
// op must be rejected with a diagnostic naming the 32-bit width.
func.func @transpose_load_elem_f32(%i : index, %j : index, %src : memref<128x32xf32, 3>) -> vector<4xf32> {
  // expected-error@+1 {{'amdgpu.transpose_load' op Unsupported element type size for transpose load: 32 bits}}
  %loaded = amdgpu.transpose_load %src[%i, %j] : memref<128x32xf32, 3> -> vector<4xf32>
  return %loaded : vector<4xf32>
}
|
||||
|
||||
// -----
|
||||
|
||||
// An f16 result with 2 elements must be rejected: the diagnostic states that
// 4 elements are expected.
func.func @transpose_load_vector_size_f16(%i : index, %j : index, %src : memref<128x32xf16, 3>) -> vector<2xf16> {
  // expected-error@+1 {{'amdgpu.transpose_load' op Transferring type size mismatch: expected num of elements: 4}}
  %loaded = amdgpu.transpose_load %src[%i, %j] : memref<128x32xf16, 3> -> vector<2xf16>
  return %loaded : vector<2xf16>
}
|
||||
|
||||
// -----
|
||||
|
||||
// An i4 result with 20 elements must be rejected: the diagnostic states that
// 16 elements are expected.
func.func @transpose_load_vector_size_i4(%i : index, %j : index, %src : memref<128x32xi4, 3>) -> vector<20xi4> {
  // expected-error@+1 {{'amdgpu.transpose_load' op Transferring type size mismatch: expected num of elements: 16}}
  %loaded = amdgpu.transpose_load %src[%i, %j] : memref<128x32xi4, 3> -> vector<20xi4>
  return %loaded : vector<20xi4>
}
|
||||
|
||||
// -----
|
||||
|
||||
// An i8 result with 20 elements must be rejected: the diagnostic states that
// 8 elements are expected.
func.func @transpose_load_vector_size_i8(%i : index, %j : index, %src : memref<128x32xi8, 3>) -> vector<20xi8> {
  // expected-error@+1 {{'amdgpu.transpose_load' op Transferring type size mismatch: expected num of elements: 8}}
  %loaded = amdgpu.transpose_load %src[%i, %j] : memref<128x32xi8, 3> -> vector<20xi8>
  return %loaded : vector<20xi8>
}
|
||||
|
||||
// -----
|
||||
|
||||
// An i6 result with 8 elements must be rejected: the diagnostic states that
// 16 elements are expected.
// The function was previously named @transpose_load_vector_size_i8, a
// copy-paste of the i8 case's name; it is renamed so the symbol matches the
// i6 element type actually under test.
func.func @transpose_load_vector_size_i6(%idx1 : index, %idx2 : index, %mem : memref<128x32xi6, 3>) -> vector<8xi6> {
  // expected-error@+1 {{'amdgpu.transpose_load' op Transferring type size mismatch: expected num of elements: 16}}
  %0 = amdgpu.transpose_load %mem[%idx1, %idx2] : memref<128x32xi6, 3> -> vector<8xi6>
  func.return %0 : vector<8xi6>
}
|
||||
|
||||
@@ -486,3 +486,10 @@ func.func @scaled_mfma(%arg0 : f8E8M0FNU, %arg1 : vector<32xf6E2M3FN>, %arg2 : v
|
||||
%0 = amdgpu.scaled_mfma(%arg0[0] * %arg1) * (%arg0[1] * %arg1) + %arg2 { k = 64 : i32, m = 32 : i32, n = 32 : i32 } : f8E8M0FNU, vector<32xf6E2M3FN>, f8E8M0FNU, vector<32xf6E2M3FN>, vector<16xf32>
|
||||
func.return %0 : vector<16xf32>
|
||||
}
|
||||
|
||||
// Checks that a well-formed amdgpu.transpose_load (f16 elements,
// address-space-3 source, 4-element result) is accepted and still appears as
// amdgpu.transpose_load in the tool output.
// CHECK-LABEL: func @transpose_load
func.func @transpose_load(%i : index, %j : index, %src : memref<128x32xf16, 3>) -> vector<4xf16> {
  // CHECK: amdgpu.transpose_load
  %loaded = amdgpu.transpose_load %src[%i, %j] : memref<128x32xf16, 3> -> vector<4xf16>
  return %loaded : vector<4xf16>
}
|
||||
|
||||
Reference in New Issue
Block a user