[MLIR][NVVM] Add TMA linear prefetch Op (#141211)

This patch adds an Op for the TMA prefetch
(non-tensor) variant. llvm-lit tests are added
to verify the lowering to the intrinsic.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>

Author: Durgadoss R
Date: 2025-05-26 15:31:12 +05:30
Committer: GitHub
Commit: b038dc24f3 (parent: 38cec041d3)
3 changed files with 72 additions and 0 deletions
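
For context, a minimal usage sketch of the new op, mirroring the llvm-lit test added by this commit (the wrapper function name is illustrative; `%src` is a pointer in the global address space, `%size` a byte count, and `%ch` an opaque 64-bit cache-hint value):

```mlir
llvm.func @bulk_prefetch_example(%src : !llvm.ptr<1>, %size : i32, %ch : i64) {
  // Prefetch %size bytes starting at %src into the L2 cache.
  nvvm.cp.async.bulk.prefetch %src, %size : !llvm.ptr<1>
  // Same prefetch, but with an L2 cache-eviction hint.
  nvvm.cp.async.bulk.prefetch %src, %size l2_cache_hint = %ch : !llvm.ptr<1>
  llvm.return
}
```

Both forms are exercised verbatim by the lit test at the end of this commit.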


@@ -2344,6 +2344,49 @@ def NVVM_PrefetchTensorMapOp : NVVM_Op<"prefetch.tensormap",
}];
}

def NVVM_CpAsyncBulkPrefetchOp : NVVM_Op<"cp.async.bulk.prefetch"> {
  let summary = "Async bulk prefetch from global memory to L2 cache";
  let description = [{
    Initiates an asynchronous prefetch of data from the location
    specified by `srcMem` to the L2 cache.

    The `l2CacheHint` operand is optional; when present, it specifies
    the cache-eviction policy to apply during the memory access.

    Example:
    ```mlir
    nvvm.cp.async.bulk.prefetch %src, %size : !llvm.ptr<1>

    // with l2_cache_hint
    nvvm.cp.async.bulk.prefetch %src, %size l2_cache_hint = %ch : !llvm.ptr<1>
    ```

    [For more information, see PTX ISA](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch)
  }];

  let arguments = (ins
    LLVM_PointerGlobal:$srcMem,
    I32:$size,
    Optional<I64>:$l2CacheHint);

  let assemblyFormat = [{
    $srcMem `,` $size (`l2_cache_hint` `=` $l2CacheHint^ )?
    attr-dict `:` type($srcMem)
  }];

  let extraClassDeclaration = [{
    static mlir::NVVM::IDArgPair
    getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
                          llvm::IRBuilderBase &builder);
  }];

  string llvmBuilder = [{
    auto [id, args] = NVVM::CpAsyncBulkPrefetchOp::getIntrinsicIDAndArgs(
        *op, moduleTranslation, builder);
    createIntrinsicCall(builder, id, args);
  }];
}

def NVVM_CpAsyncBulkTensorPrefetchOp :
    NVVM_Op<"cp.async.bulk.tensor.prefetch", [AttrSizedOperandSegments]> {
  let arguments = (ins


@@ -1254,6 +1254,26 @@ CpAsyncOp::getIntrinsicIDAndArgs(Operation &op, LLVM::ModuleTranslation &mt,
  return id;
}

mlir::NVVM::IDArgPair CpAsyncBulkPrefetchOp::getIntrinsicIDAndArgs(
    Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
  auto thisOp = cast<NVVM::CpAsyncBulkPrefetchOp>(op);
  llvm::SmallVector<llvm::Value *> args;
  llvm::Intrinsic::ID id = llvm::Intrinsic::nvvm_cp_async_bulk_prefetch_L2;

  // Fill the intrinsic args: source pointer and size first.
  args.push_back(mt.lookupValue(thisOp.getSrcMem()));
  args.push_back(mt.lookupValue(thisOp.getSize()));

  // The cache hint is optional. When absent, pass an unused i64 zero and
  // set the trailing i1 flag to false so the backend ignores the hint.
  mlir::Value cacheHint = thisOp.getL2CacheHint();
  const bool hasCacheHint = static_cast<bool>(cacheHint);
  llvm::Value *i64Unused =
      llvm::ConstantInt::get(llvm::Type::getInt64Ty(mt.getLLVMContext()), 0);
  args.push_back(hasCacheHint ? mt.lookupValue(cacheHint) : i64Unused);
  args.push_back(builder.getInt1(hasCacheHint));

  return {id, std::move(args)};
}

mlir::NVVM::IDArgPair CpAsyncBulkSharedCTAToGlobalOp::getIntrinsicIDAndArgs(
    Operation &op, LLVM::ModuleTranslation &mt, llvm::IRBuilderBase &builder) {
  auto thisOp = cast<NVVM::CpAsyncBulkSharedCTAToGlobalOp>(op);

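Before the test diff, a sketch of the intrinsic call shape the lowering above produces. The declaration is inferred from the CHECK lines in the lit test below; the authoritative definition lives in LLVM's NVVM intrinsics:

```llvm
; Inferred signature (sketch): source pointer in the global address space,
; transfer size in bytes, an i64 cache-hint value, and an i1 flag telling
; the backend whether the hint operand is valid.
declare void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1), i32, i64, i1)
```

When no `l2_cache_hint` is given, the translation passes `i64 0` and `i1 false`, as the first CHECK line below verifies.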

@@ -1,5 +1,14 @@
// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s

// CHECK-LABEL: @tma_bulk_prefetch
llvm.func @tma_bulk_prefetch(%src : !llvm.ptr<1>, %size : i32, %ch : i64) {
  // CHECK: call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %{{.*}}, i32 %{{.*}}, i64 0, i1 false)
  // CHECK: call void @llvm.nvvm.cp.async.bulk.prefetch.L2(ptr addrspace(1) %{{.*}}, i32 %{{.*}}, i64 %{{.*}}, i1 true)
  nvvm.cp.async.bulk.prefetch %src, %size : !llvm.ptr<1>
  nvvm.cp.async.bulk.prefetch %src, %size l2_cache_hint = %ch : !llvm.ptr<1>
  llvm.return
}

// CHECK-LABEL: @tma_prefetch_1d
llvm.func @tma_prefetch_1d(%tma_desc : !llvm.ptr, %d0 : i32, %ch : i64) {
  // CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %0, i32 %{{.*}}, i64 0, i1 false)