Support loops without static boundaries. Since the number of iterations is not known statically, we need to predicate the prologue and epilogue in case the number of iterations is smaller than the number of pipeline stages. This patch includes work from @chengjunlu.
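For illustration, a minimal sketch (not the literal pass output) of the kind of predication this produces around an `nvgpu.device_async_copy` in the pipelined loop body: the copy issued for a future iteration keeps its element count only while the shifted induction variable is still inside the original loop bounds, and is forced to copy 0 elements otherwise.

```mlir
// Hypothetical, simplified excerpt; %i_next, %upper_bound and %orig_elements
// stand in for values the pipeliner computes when it rewrites the loop body.
%c0 = arith.constant 0 : index
%in_bounds = arith.cmpi slt, %i_next, %upper_bound : index
%elements = arith.select %in_bounds, %orig_elements, %c0 : index
%token = nvgpu.device_async_copy %global[%i_next], %shared[%i_next], 4, %elements
  : memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>>
```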
// RUN: mlir-opt %s --transform-interpreter -canonicalize --split-input-file --verify-diagnostics | FileCheck %s

func.func @simple_depth_2_unpeeled(%global: memref<?xf32>, %result: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c100 = arith.constant 100 : index
  %c4 = arith.constant 4 : index
  %shared = memref.alloc(%c100) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
  // Predication is not currently implemented for transfer_read/write, so this is expected to fail.
  // expected-note @below {{couldn't predicate}}
  scf.for %i = %c0 to %c100 step %c4 iter_args(%accum = %c0f) -> f32 {
    %mem = vector.transfer_read %global[%i], %c0f : memref<?xf32>, vector<4xf32>
    vector.transfer_write %mem, %shared[%i] : vector<4xf32>, memref<?xf32, #gpu.address_space<workgroup>>
    %0 = arith.addf %accum, %accum : f32
    scf.yield %0 : f32
  }
  return
}

!t = !transform.any_op

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    // expected-error @below {{irreversible pipelining failure}}
    // expected-note @below {{try setting "peel_epilogue"}}
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2 } : (!t) -> !t
    transform.yield
  }
}
// -----
// Loop pipeliner is tested separately, just verify the overall shape of the IR here.

func.func private @body(index, memref<?xf32, #gpu.address_space<workgroup>>)

// CHECK-LABEL: @simple_depth_2_peeled
// CHECK-SAME: %[[ARG:.+]]: memref
func.func @simple_depth_2_peeled(%global: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c100 = arith.constant 100 : index
  %c200 = arith.constant 200 : index
  %c4 = arith.constant 4 : index
  // CHECK: memref.alloc
  %shared = memref.alloc(%c200) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
  // CHECK: %[[LOADED1:.+]] = vector.transfer_read %[[ARG]]
  // CHECK: %[[LOADED2:.+]] = vector.transfer_read %[[ARG]]
  // CHECK: %[[LOOP:.+]]:2 = scf.for {{.*}} iter_args(%[[IA1:.+]] = %[[LOADED1]], %[[IA2:.+]] = %[[LOADED2]])
  // CHECK: vector.transfer_write %[[IA1]]
  // CHECK: func.call @body
  // CHECK: %[[LOCAL_LOADED:.+]] = vector.transfer_read %[[ARG]]
  // CHECK: scf.yield %[[IA2]], %[[LOCAL_LOADED]]
  scf.for %i = %c0 to %c100 step %c4 {
    %mem = vector.transfer_read %global[%i], %c0f : memref<?xf32>, vector<4xf32>
    vector.transfer_write %mem, %shared[%i] : vector<4xf32>, memref<?xf32, #gpu.address_space<workgroup>>
    func.call @body(%i, %shared) : (index, memref<?xf32, #gpu.address_space<workgroup>>) -> ()
  }
  // CHECK: vector.transfer_write %[[LOOP]]#0
  // CHECK: call @body
  // CHECK: vector.transfer_write %[[LOOP]]#1
  // CHECK: call @body
  return
}

!t = !transform.any_op

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2, peel_epilogue } : (!t) -> !t
    transform.yield
  }
}
// -----
// CHECK-LABEL: @async_depth_2_predicated
// CHECK-SAME: %[[GLOBAL:.+]]: memref
func.func @async_depth_2_predicated(%global: memref<?xf32>, %alloc_size: index) {
  %c0 = arith.constant 0 : index
  %c98 = arith.constant 98 : index
  %c100 = arith.constant 100 : index
  // CHECK-DAG: %[[C4:.+]] = arith.constant 4
  // CHECK-DAG: %[[C90:.+]] = arith.constant 90
  // CHECK-DAG: %[[C96:.+]] = arith.constant 96
  // CHECK-DAG: %[[C8:.+]] = arith.constant 8
  // CHECK-DAG: %[[C2:.+]] = arith.constant 2
  // CHECK-DAG: %[[C0:.+]] = arith.constant 0
  %c4 = arith.constant 4 : index
  // CHECK: %[[SHARED:.+]] = memref.alloc{{.*}} #gpu.address_space<workgroup>
  %shared = memref.alloc(%alloc_size) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
  // CHECK: %[[TOKEN0:.+]] = nvgpu.device_async_copy
  // CHECK: %[[TOKEN1:.+]] = nvgpu.device_async_copy
  // CHECK: scf.for %[[I:.+]] = {{.*}} iter_args
  // CHECK-SAME: %[[ITER_ARG0:.+]] = %[[TOKEN0]]
  // CHECK-SAME: %[[ITER_ARG1:.+]] = %[[TOKEN1]]
  scf.for %i = %c0 to %c98 step %c4 {
    // Condition for the predication "select" below.
    // CHECK: %[[CMP0:.+]] = arith.cmpi slt, %[[I]], %[[C90]]
    // CHECK: nvgpu.device_async_wait %[[ITER_ARG0]] {numGroups = 1
    // Original "select" with updated induction variable.
    // CHECK: %[[I_PLUS_8:.+]] = arith.addi %[[I]], %[[C8]]
    // CHECK: %[[CMP1:.+]] = arith.cmpi slt, %[[I_PLUS_8]], %[[C96]]
    // CHECK: %[[SELECTED0:.+]] = arith.select %[[CMP1]], %[[C4]], %[[C2]]
    %c96 = arith.constant 96 : index
    %cond = arith.cmpi slt, %i, %c96 : index
    %c2 = arith.constant 2 : index
    %read_size = arith.select %cond, %c4, %c2 : index

    // Updated induction variables (two more) for the device_async_copy below.
    // These are generated repeatedly by the pipeliner.
    // CHECK: %[[I_PLUS_8_2:.+]] = arith.addi %[[I]], %[[C8]]
    // CHECK: %[[I_PLUS_8_3:.+]] = arith.addi %[[I]], %[[C8]]

    // The second "select" is generated by predication and selects 0 for
    // the last two iterations.
    // CHECK: %[[SELECTED1:.+]] = arith.select %[[CMP0]], %[[SELECTED0]], %[[C0]]
    // CHECK: %[[ASYNC_TOKEN:.+]] = nvgpu.device_async_copy %[[GLOBAL]][%[[I_PLUS_8_3]]], %[[SHARED]][%[[I_PLUS_8_2]]], 4, %[[SELECTED1]]
    %token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size
      : memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>>

    nvgpu.device_async_wait %token

    // CHECK: scf.yield %[[ITER_ARG1]], %[[ASYNC_TOKEN]]
  }
  // There is no need to wait for the last copies as they were fully predicated
  // out and don't load the original data.
  // CHECK-NOT: nvgpu.device_async_wait
  return
}

!t = !transform.any_op

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2 } : (!t) -> !t
    transform.yield
  }
}
// -----
// CHECK-LABEL: @async_depth_2_peeled
func.func @async_depth_2_peeled(%global: memref<?xf32>) {
  %c0 = arith.constant 0 : index
  %c98 = arith.constant 98 : index
  %c100 = arith.constant 100 : index
  %c4 = arith.constant 4 : index
  %shared = memref.alloc(%c100) : memref<?xf32, #gpu.address_space<workgroup>>
  %c0f = arith.constant 0.0 : f32
  // CHECK: nvgpu.device_async_copy
  // CHECK: nvgpu.device_async_copy
  // CHECK: scf.for
  // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1
  // CHECK: arith.select
  // CHECK: nvgpu.device_async_copy
  // CHECK: scf.yield
  // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1
  // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 0
  scf.for %i = %c0 to %c98 step %c4 {
    %c96 = arith.constant 96 : index
    %cond = arith.cmpi slt, %i, %c96 : index
    %c2 = arith.constant 2 : index
    %read_size = arith.select %cond, %c4, %c2 : index
    %token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size
      : memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>>
    nvgpu.device_async_wait %token
  }
  return
}

!t = !transform.any_op

module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !t {transform.readonly}) {
    %loop = transform.structured.match ops{["scf.for"]} in %arg0 : (!t) -> !t
    transform.nvgpu.pipeline_shared_memory_copies failures(propagate) %loop { depth = 2, peel_epilogue } : (!t) -> !t
    transform.yield
  }
}