[mlir] lower min/maxnum to libdevice calls (#127323)

Introduce lowering from the arith.minnumf/maxnumf operations to the
corresponding Nvidia libdevice calls. This requires reordering the pattern
population methods so that the libdevice-targeting patterns take priority
over the default patterns that lower Arith dialect operations to LLVM IR
intrinsics. The tests are placed in a separate file because the existing
gpu-to-nvvm.mlir file has a mode that forces Arith dialect operations to be
preserved as is, and it does not use a separate FileCheck prefix to
differentiate the two modes.
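
For illustration, a minimal sketch of the intended lowering, assuming the
operation lives inside a gpu.module processed by -convert-gpu-to-nvvm (this
mirrors the new test added below; the module and function names are made up):

  gpu.module @example {
    func.func @fmax(%a: f32, %b: f32) -> f32 {
      // With the libdevice-targeting patterns taking priority, this lowers to
      // `llvm.call @__nv_fmaxf(%a, %b) : (f32, f32) -> f32` instead of the
      // generic llvm.intr.maxnum intrinsic.
      %0 = arith.maxnumf %a, %b : f32
      return %0 : f32
    }
  }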

Co-authored-by: William Moses <gh@wsmoses.com>
Oleksandr "Alex" Zinenko
2025-02-15 23:53:36 +01:00
committed by GitHub
parent 256145b4b0
commit 963ff1c305
2 changed files with 28 additions and 1 deletions


@@ -378,6 +378,8 @@ struct LowerGpuOpsToNVVMOpsPass final
RewritePatternSet llvmPatterns(m.getContext());
LLVMConversionTarget target(getContext());
populateGpuToNVVMConversionPatterns(converter, llvmPatterns);
llvm::SmallDenseSet<StringRef> allowedDialectsSet(allowedDialects.begin(),
allowedDialects.end());
for (Dialect *dialect : getContext().getLoadedDialects()) {
@@ -407,7 +409,6 @@ struct LowerGpuOpsToNVVMOpsPass final
llvmPatterns);
}
populateGpuToNVVMConversionPatterns(converter, llvmPatterns);
populateGpuWMMAToNVVMConversionPatterns(converter, llvmPatterns);
if (this->hasRedux)
populateGpuSubgroupReduceOpLoweringPattern(converter, llvmPatterns);
@@ -552,6 +553,11 @@ void mlir::populateGpuToNVVMConversionPatterns(
populateOpPatterns<arith::RemFOp>(converter, patterns, "__nv_fmodf",
"__nv_fmod");
populateOpPatterns<arith::MaxNumFOp>(converter, patterns, "__nv_fmaxf",
"__nv_fmax");
populateOpPatterns<arith::MinNumFOp>(converter, patterns, "__nv_fminf",
"__nv_fmin");
populateIntOpPatterns<math::AbsIOp>(converter, patterns, "__nv_abs");
populateOpPatterns<math::AbsFOp>(converter, patterns, "__nv_fabsf",
"__nv_fabs");


@@ -0,0 +1,21 @@
// RUN: mlir-opt %s -convert-gpu-to-nvvm -split-input-file | FileCheck %s
gpu.module @test_module_54 {
// CHECK: llvm.func @__nv_fmaxf(f32, f32) -> f32
// CHECK: llvm.func @__nv_fminf(f32, f32) -> f32
// CHECK: llvm.func @__nv_fmax(f64, f64) -> f64
// CHECK: llvm.func @__nv_fmin(f64, f64) -> f64
// CHECK-LABEL: @gpu_fminmax
func.func @gpu_fminmax(%arg1_f32: f32, %arg2_f32: f32, %arg1_f64: f64, %arg2_f64: f64)
-> (f32, f32, f64, f64) {
// CHECK: llvm.call @__nv_fmaxf
%max_f32 = arith.maxnumf %arg1_f32, %arg2_f32 : f32
// CHECK: llvm.call @__nv_fminf
%min_f32 = arith.minnumf %arg1_f32, %arg2_f32 : f32
// CHECK: llvm.call @__nv_fmax(
%max_f64 = arith.maxnumf %arg1_f64, %arg2_f64 : f64
// CHECK: llvm.call @__nv_fmin(
%min_f64 = arith.minnumf %arg1_f64, %arg2_f64 : f64
return %max_f32, %min_f32, %max_f64, %min_f64 : f32, f32, f64, f64
}
}