clang-p2996/mlir/test/Dialect/NVGPU/tmaload-transform.mlir
Oleksandr "Alex" Zinenko e4384149b5 [mlir] use transform-interpreter in test passes (#70040)
Update most test passes to use the transform-interpreter pass instead of
the test-transform-dialect-interpreter pass. The new "main" interpreter
pass has a named entry point instead of looking up the top-level op with
`PossibleTopLevelOpTrait`, which is arguably a more understandable
interface. The change is mechanical: an unnamed sequence is rewritten into
a named one, and the transform IR is wrapped into a module when necessary.
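
For illustration, a minimal sketch of that mechanical rewrite (the sequence
bodies are elided; the named form matches the script at the bottom of this
test):

  // Before: unnamed top-level sequence, located via PossibleTopLevelOpTrait.
  transform.sequence failures(propagate) {
  ^bb0(%arg0: !transform.any_op):
    // ... transform ops ...
  }

  // After: named entry point wrapped in a module.
  module attributes {transform.with_named_sequence} {
    transform.named_sequence @__transform_main(
        %arg0: !transform.any_op {transform.readonly}) {
      // ... transform ops ...
      transform.yield
    }
  }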

Add an option to the transform-interpreter pass to target a tagged
payload op instead of the root anchor op, which is also useful for repro
generation.
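
For example, a minimal sketch (assuming the option is spelled
debug-payload-root-tag and the payload op is tagged with a
transform.target_tag string attribute):

  // RUN: mlir-opt %s -transform-interpreter='debug-payload-root-tag=payload'

  // Only this op is used as the payload root instead of the whole module.
  func.func @payload_root() attributes {transform.target_tag = "payload"} {
    return
  }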

Only the tests in the transform dialect proper and the examples have not
been updated yet. These will be updated separately after a more careful
consideration of the testing coverage of the transform interpreter logic.

// RUN: mlir-opt %s \
// RUN:     -transform-interpreter \
// RUN:     -test-transform-dialect-erase-schedule \
// RUN: | FileCheck %s

memref.global "private" @bufferLhsGlobal : memref<64x8xf32, #gpu.address_space<workgroup>>
memref.global "private" @bufferRhsGlobal : memref<8x128xf32, #gpu.address_space<workgroup>>

// CHECK-LABEL: func.func @main()
func.func @main() {
  %c1 = arith.constant 1 : index
  %c128 = arith.constant 128 : index

  %0 = gpu.wait async
  %memref, %asyncToken = gpu.alloc async [%0] () : memref<64x8xf32>
  %memref_1, %asyncToken_2 = gpu.alloc async [%0] () : memref<8x128xf32>

  // CHECK: %[[M1:.*]] = memref.cast %{{.*}} : memref<64x8xf32> to memref<*xf32>
  // CHECK: %[[c64:.*]] = arith.constant 64 : index
  // CHECK: %[[c8:.*]] = arith.constant 8 : index
  // CHECK: %[[D1:.*]] = nvgpu.tma.create.descriptor %[[M1]] box[%[[c64]], %[[c8]]]
  // CHECK-SAME: : memref<*xf32> -> <tensor = memref<64x8xf32, #gpu.address_space<workgroup>>, swizzle = none, l2promo = none, oob = zero, interleave = none>
  // CHECK: %[[cast_2:.*]] = memref.cast %memref_0 : memref<8x128xf32> to memref<*xf32>
  // CHECK: %[[c8_2:.*]] = arith.constant 8 : index
  // CHECK: %[[c128_2:.*]] = arith.constant 128 : index
  // CHECK: %[[D2:.*]] = nvgpu.tma.create.descriptor %cast_2 box[%[[c8_2]], %[[c128_2]]]
  // CHECK-SAME: : memref<*xf32> -> <tensor = memref<8x128xf32, #gpu.address_space<workgroup>>, swizzle = none, l2promo = none, oob = zero, interleave = none>
  // CHECK: gpu.launch
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
             threads(%tx, %ty, %tz) in (%block_x = %c128, %block_y = %c1, %block_z = %c1) {
    // CHECK: %[[G1:.*]] = memref.get_global @bufferLhsGlobal : memref<64x8xf32, #gpu.address_space<workgroup>>
    // CHECK: %[[G2:.*]] = memref.get_global @bufferRhsGlobal : memref<8x128xf32, #gpu.address_space<workgroup>>
    %out = memref.get_global @bufferLhsGlobal : memref<64x8xf32, #gpu.address_space<workgroup>>
    %out_1 = memref.get_global @bufferRhsGlobal : memref<8x128xf32, #gpu.address_space<workgroup>>
    // CHECK: %[[B:.*]] = nvgpu.mbarrier.create -> <memorySpace = #gpu.address_space<workgroup>
    // CHECK: nvgpu.mbarrier.init %[[B]][%{{.*}}], %{{.*}} : <memorySpace = #gpu.address_space<workgroup>
    // CHECK: gpu.barrier
    //
    // CHECK: %[[c0:.*]] = arith.constant 0 : index
    // CHECK: %[[TIDX:.*]] = gpu.thread_id x
    // CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[TIDX]], %[[c0]] : index
    //
    // CHECK: scf.if %[[CMP]] {
    //
    // CHECK: %[[c0_7:.*]] = arith.constant 0 : index
    // CHECK: nvgpu.tma.async.load %[[D1]][%[[c0_7]], %[[c0_7]]], %[[B]][%{{.*}}] to %[[G1]]
    // CHECK-SAME: : <tensor = memref<64x8xf32, #gpu.address_space<workgroup>>,
    // CHECK-SAME: swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>
    // CHECK-SAME: -> memref<64x8xf32, #gpu.address_space<workgroup>>
    //
    // CHECK: %[[c0_8:.*]] = arith.constant 0 : index
    // CHECK: nvgpu.tma.async.load %[[D2]][%[[c0_8]], %[[c0_8]]], %[[B]][%{{.*}}] to %[[G2]]
    // CHECK-SAME: : <tensor = memref<8x128xf32, #gpu.address_space<workgroup>>,
    // CHECK-SAME: swizzle = none, l2promo = none, oob = zero, interleave = none>, <memorySpace = #gpu.address_space<workgroup>
    // CHECK-SAME: -> memref<8x128xf32, #gpu.address_space<workgroup>>
    //
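    /// 6144 bytes = 64*8*4 + 8*128*4: the barrier expects the combined size of
    /// both TMA transfers issued above.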
    // CHECK: %[[c6144:.*]] = arith.constant 6144 : index
    // CHECK: nvgpu.mbarrier.arrive.expect_tx %[[B]][%{{.*}}], %[[c6144]] : <memorySpace = #gpu.address_space<workgroup>
    // CHECK: } else {
    // CHECK: %[[c0_7:.*]] = arith.constant 0 : index
    // CHECK: nvgpu.mbarrier.arrive.expect_tx %[[B]][%{{.*}}], %[[c0_7]] : <memorySpace = #gpu.address_space<workgroup>
    // CHECK: }
    //
    // CHECK: %[[c0_6:.*]] = arith.constant 0 : index
    // CHECK: %[[c10000000:.*]] = arith.constant 10000000 : index
    // CHECK: nvgpu.mbarrier.try_wait.parity %[[B]][%{{.*}}], %[[c0_6]], %[[c10000000]] : <memorySpace = #gpu.address_space<workgroup>

    /// Both copies are matched and end up in the same async group.
    linalg.copy ins(%memref: memref<64x8xf32>) outs(%out: memref<64x8xf32, #gpu.address_space<workgroup>>)
    linalg.copy ins(%memref_1: memref<8x128xf32>) outs(%out_1: memref<8x128xf32, #gpu.address_space<workgroup>>)

    gpu.terminator
  }
  return
}
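
/// The transform script below matches every linalg.copy in the payload and
/// rewrites it as a TMA load via transform.nvgpu.rewrite_copy_as_tma.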
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
    %copy = transform.structured.match ops{["linalg.copy"]} in %arg1
      : (!transform.any_op) -> !transform.any_op
    transform.nvgpu.rewrite_copy_as_tma %copy : (!transform.any_op) -> ()
    transform.yield
  }
}