clang-p2996/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir
Thomas Raoux ef112833e1 [MLIR][SCF] Add support for pipelining dynamic loops (#74350)
Support loops without static bounds. Since the number of iterations is not
known, we need to predicate the prologue and epilogue in case the number of
iterations is smaller than the number of stages.

This patch includes work from @chengjunlu
2023-12-10 22:32:11 -08:00

// RUN: mlir-opt %s --transform-interpreter -canonicalize --split-input-file --verify-diagnostics | FileCheck %s
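// Without `peel_epilogue`, the epilogue iterations are executed by the loop
// body under a predicate, which requires every pipelined operation to be
// predicable.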
func.func @simple_depth_2_unpeeled(%global: memref<?xf32>, %result: memref<?xf32> ) {
  %c0 = arith.constant 0 : index
  %c100 = arith.constant 100 : index
  %c4 = arith.constant 4 : index
  %shared = memref.alloc(%c100) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
  // Predication is not currently implemented for transfer_read/write, so this is expected to fail.
  // expected-note @below {{couldn't predicate}}
  scf.for %i = %c0 to %c100 step %c4 iter_args(%accum = %c0f) -> f32 {
    %mem = vector.transfer_read %global[%i], %c0f : memref<?xf32>, vector<4xf32>
    vector.transfer_write %mem, %shared[%i] : vector<4xf32>, memref<?xf32, #gpu.address_space<workgroup>>
    %0 = arith.addf %accum, %accum : f32
    scf.yield %0 : f32
  }
  return
}
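// The transform script matches the scf.for above and requests a two-stage
// (depth = 2) pipeline of its shared-memory copies without peeling the
// epilogue, which is expected to fail with the diagnostics checked above and
// below.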
!t = !transform.any_op
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    // expected-error @below {{irreversible pipelining failure}}
    // expected-note @below {{try setting "peel_epilogue"}}
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2 } : (!t) -> !t
    transform.yield
  }
}
// -----
// The loop pipeliner is tested separately; here we only verify the overall shape of the IR.
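// With `peel_epilogue`, the two prologue reads are hoisted before the loop and
// carried as iter_args; each iteration writes the value loaded two steps
// earlier, and the epilogue after the loop drains the last two values.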
func.func private @body(index, memref<?xf32, #gpu.address_space<workgroup>>)
// CHECK-LABEL: @simple_depth_2_peeled
// CHECK-SAME: %[[ARG:.+]]: memref
func.func @simple_depth_2_peeled(%global: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c100 = arith.constant 100 : index
  %c200 = arith.constant 200 : index
  %c4 = arith.constant 4 : index
  // CHECK: memref.alloc
  %shared = memref.alloc(%c200) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
  // CHECK: %[[LOADED1:.+]] = vector.transfer_read %[[ARG]]
  // CHECK: %[[LOADED2:.+]] = vector.transfer_read %[[ARG]]
  // CHECK: %[[LOOP:.+]]:2 = scf.for {{.*}} iter_args(%[[IA1:.+]] = %[[LOADED1]], %[[IA2:.+]] = %[[LOADED2]])
  // CHECK: vector.transfer_write %[[IA1]]
  // CHECK: func.call @body
  // CHECK: %[[LOCAL_LOADED:.+]] = vector.transfer_read %[[ARG]]
  // CHECK: scf.yield %[[IA2]], %[[LOCAL_LOADED]]
  scf.for %i = %c0 to %c100 step %c4 {
    %mem = vector.transfer_read %global[%i], %c0f : memref<?xf32>, vector<4xf32>
    vector.transfer_write %mem, %shared[%i] : vector<4xf32>, memref<?xf32, #gpu.address_space<workgroup>>
    func.call @body(%i, %shared) : (index, memref<?xf32, #gpu.address_space<workgroup>>) -> ()
  }
  // CHECK: vector.transfer_write %[[LOOP]]#0
  // CHECK: call @body
  // CHECK: vector.transfer_write %[[LOOP]]#1
  // CHECK: call @body
  return
}
!t = !transform.any_op
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2, peel_epilogue } : (!t) -> !t
    transform.yield
  }
}
// -----
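// Here the copies are nvgpu.device_async_copy ops whose last operand is the
// number of source elements to transfer, so the pipeliner can predicate them
// instead of peeling: the predicated-out epilogue stages simply copy 0
// elements.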
// CHECK-LABEL: @async_depth_2_predicated
// CHECK-SAME: %[[GLOBAL:.+]]: memref
func.func @async_depth_2_predicated(%global: memref<?xf32>, %alloc_size: index) {
  %c0 = arith.constant 0 : index
  %c98 = arith.constant 98 : index
  %c100 = arith.constant 100 : index
  // CHECK-DAG: %[[C4:.+]] = arith.constant 4
  // CHECK-DAG: %[[C90:.+]] = arith.constant 90
  // CHECK-DAG: %[[C96:.+]] = arith.constant 96
  // CHECK-DAG: %[[C8:.+]] = arith.constant 8
  // CHECK-DAG: %[[C2:.+]] = arith.constant 2
  // CHECK-DAG: %[[C0:.+]] = arith.constant 0
  %c4 = arith.constant 4 : index
  // CHECK: %[[SHARED:.+]] = memref.alloc{{.*}} #gpu.address_space<workgroup>
  %shared = memref.alloc(%alloc_size) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
  // CHECK: %[[TOKEN0:.+]] = nvgpu.device_async_copy
  // CHECK: %[[TOKEN1:.+]] = nvgpu.device_async_copy
  // CHECK: scf.for %[[I:.+]] = {{.*}} iter_args
  // CHECK-SAME: %[[ITER_ARG0:.+]] = %[[TOKEN0]]
  // CHECK-SAME: %[[ITER_ARG1:.+]] = %[[TOKEN1]]
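  // The async-copy tokens are threaded through the iter_args: each iteration
  // waits on the copy issued two stages earlier and starts the copy for the
  // iteration two steps ahead, hence the +8 offsets checked below.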
  scf.for %i = %c0 to %c98 step %c4 {
    // Condition for the predication "select" below.
    // CHECK: %[[CMP0:.+]] = arith.cmpi slt, %[[I]], %[[C90]]
    // CHECK: nvgpu.device_async_wait %[[ITER_ARG0]] {numGroups = 1
    // Original "select" with updated induction variable.
    // CHECK: %[[I_PLUS_8:.+]] = arith.addi %[[I]], %[[C8]]
    // CHECK: %[[CMP1:.+]] = arith.cmpi slt, %[[I_PLUS_8]], %[[C96]]
    // CHECK: %[[SELECTED0:.+]] = arith.select %[[CMP1]], %[[C4]], %[[C2]]
    %c96 = arith.constant 96 : index
    %cond = arith.cmpi slt, %i, %c96 : index
    %c2 = arith.constant 2 : index
    %read_size = arith.select %cond, %c4, %c2 : index
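    // %read_size limits how many of the 4 destination elements are actually
    // read from %global: 4 for most iterations and 2 for the final one.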
    // Updated induction variables (two more) for the device_async_copy below.
    // These are generated repeatedly by the pipeliner.
    // CHECK: %[[I_PLUS_8_2:.+]] = arith.addi %[[I]], %[[C8]]
    // CHECK: %[[I_PLUS_8_3:.+]] = arith.addi %[[I]], %[[C8]]
    // The second "select" is generated by predication and selects 0 for
    // the last two iterations.
    // CHECK: %[[SELECTED1:.+]] = arith.select %[[CMP0]], %[[SELECTED0]], %[[C0]]
    // CHECK: %[[ASYNC_TOKEN:.+]] = nvgpu.device_async_copy %[[GLOBAL]][%[[I_PLUS_8_3]]], %[[SHARED]][%[[I_PLUS_8_2]]], 4, %[[SELECTED1]]
    %token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size
      : memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>>
    nvgpu.device_async_wait %token
    // CHECK: scf.yield %[[ITER_ARG1]], %[[ASYNC_TOKEN]]
  }
  // There is no need to wait for the last copies as they were fully predicated
  // out and do not load the original data.
  // CHECK-NOT: nvgpu.device_async_wait
  return
}
!t = !transform.any_op
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2 } : (!t) -> !t
    transform.yield
  }
}
// -----
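// Same async copies, but with the epilogue peeled: the drained stages keep
// their device_async_wait ops with decreasing numGroups instead of being
// predicated away.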
// CHECK-LABEL: @async_depth_2_peeled
func.func @async_depth_2_peeled(%global: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c98 = arith.constant 98 : index
  %c100 = arith.constant 100 : index
  %c4 = arith.constant 4 : index
  %shared = memref.alloc(%c100) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
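  // Expected shape: two prologue copies, then inside the loop a wait on the
  // oldest in-flight copy followed by the next copy, and two epilogue waits
  // whose numGroups decrease as the pipeline drains.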
  // CHECK: nvgpu.device_async_copy
  // CHECK: nvgpu.device_async_copy
  // CHECK: scf.for
  // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1
  // CHECK: arith.select
  // CHECK: nvgpu.device_async_copy
  // CHECK: scf.yield
  // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1
  // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 0
  scf.for %i = %c0 to %c98 step %c4 {
    %c96 = arith.constant 96 : index
    %cond = arith.cmpi slt, %i, %c96 : index
    %c2 = arith.constant 2 : index
    %read_size = arith.select %cond, %c4, %c2 : index
    %token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size
      : memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>>
    nvgpu.device_async_wait %token
  }
  return
}
!t = !transform.any_op
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2, peel_epilogue } : (!t) -> !t
    transform.yield
  }
}