When converting affine.for to the GPU launch operator, we have to calculate the block dimension and the thread dimension for the launch operator. The formula for a dimension size is (upper_bound - lower_bound) / step_size. When the difference is not divisible by step_size, DivSIOp rounds the division result toward zero. However, the block dimension and the thread dimension are right-open ranges, i.e., [0, block_dim) and [0, thread_dim), so a truncated quotient drops the last, partial step and we get the wrong result if we use DivSIOp. For example, a loop from 5 to 15 with step 4 runs three iterations (5, 9, 13), but (15 - 5) / 4 rounded toward zero is 2. In this patch, we replace DivSIOp with CeilDivSIOp to get the correct block and thread dimension values.
29 lines
1.4 KiB
MLIR
29 lines
1.4 KiB
MLIR
// RUN: mlir-opt -pass-pipeline="builtin.module(func.func(convert-affine-for-to-gpu{gpu-block-dims=1 gpu-thread-dims=1}))" %s | FileCheck %s
|
|
|
|
// CHECK-LABEL: @step_var
|
|
// Test input: copies %A[%i, %j] to %B[%i, %j] inside a two-deep affine.for
// nest whose loops both have a non-unit step. The patterns below expect the
// outer loop's range to become the first block dimension and the inner loop's
// range to become the first thread dimension of the generated gpu.launch.
func.func @step_var(%A : memref<?x?xf32>, %B : memref<?x?xf32>) {
|
|
// Check that we divide by step.
// The division is expected to round up (arith.ceildivsi). For the bounds used
// below, (15 - 5) / 4 and (19 - 3) / 7 are both inexact; the loops run three
// iterations each (i = 5, 9, 13 and j = 3, 10, 17), so a quotient rounded
// toward zero (2) would lose the last iteration.
|
|
// CHECK: %[[range_i:.*]] = arith.ceildivsi {{.*}}, %{{.*}}
|
|
// CHECK: %[[range_j:.*]] = arith.ceildivsi {{.*}}, %{{.*}}
|
|
|
|
// The two captured ranges must appear as the first block size and the first
// thread size of the launch; the remaining sizes are matched loosely.
// CHECK: gpu.launch
|
|
// CHECK-SAME: blocks(%{{[^)]*}}, %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[range_i]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
|
|
// CHECK-SAME: threads(%{{[^)]*}}, %{{[^)]*}}, %{{[^)]*}}) in (%{{[^)]*}} = %[[range_j]], %{{[^)]*}} = %{{[^)]*}}, %{{[^)]*}} = %{{[^)]*}})
|
|
affine.for %i = 5 to 15 step 4 {
|
|
affine.for %j = 3 to 19 step 7 {
|
|
// Loop induction variable remapping:
|
|
// iv = thread(block)_id * step + lower_bound
// The multiply/add pair for %i and the pair for %j are expected on four
// consecutive output lines, in that order.
|
|
// CHECK: %[[prod_i:.*]] = arith.muli %{{.*}}, %{{.*}} : index
|
|
// CHECK-NEXT: %[[i:.*]] = arith.addi %{{.*}}, %[[prod_i]] : index
|
|
// CHECK-NEXT: %[[prod_j:.*]] = arith.muli %{{.*}}, %{{.*}} : index
|
|
// CHECK-NEXT: %[[j:.*]] = arith.addi %{{.*}}, %[[prod_j]] : index
|
|
|
|
// The load and the store must both be indexed by the remapped values
// captured above.
// CHECK: {{.*}} = memref.load %{{.*}}[%[[i]], %[[j]]] : memref<?x?xf32>
|
|
%0 = memref.load %A[%i, %j] : memref<?x?xf32>
|
|
// CHECK: memref.store {{.*}}, %{{.*}}[%[[i]], %[[j]]] : memref<?x?xf32>
|
|
memref.store %0, %B[%i, %j] : memref<?x?xf32>
|
|
}
|
|
}
|
|
return
|
|
}
|