From c507a0830df2e4fd0c234eee035aac2109de6d6e Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Thu, 15 May 2025 16:08:01 +0530 Subject: [PATCH] [NVPTX] Add TMA Bulk Copy Intrinsics (#138679) This patch adds a new variant of TMA Bulk Copy intrinsics introduced in sm100+. This variant has an additional byte_mask to select the bytes for the copy operation. * Selection is all done through table-gen now. So, this patch removes the corresponding SelectCpAsyncBulkS2G() function. * lit tests are verified with a cuda-12.8 ptxas executable. PTX Spec link: https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-bulk-copy Signed-off-by: Durgadoss R --- llvm/docs/NVPTXUsage.rst | 10 ++-- llvm/include/llvm/IR/IntrinsicsNVVM.td | 13 ++++++ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 28 ----------- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 1 - llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 32 +++++++------ .../CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll | 46 +++++++++++++++++++ llvm/test/CodeGen/NVPTX/cp-async-bulk.ll | 6 +-- 7 files changed, 88 insertions(+), 48 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 51bbfd0a5c88..957cccc6268e 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -672,6 +672,7 @@ Syntax: .. code-block:: llvm declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(..., i32 %size, i64 %ch, i1 %flag_ch, i16 %mask) Overview: """"""""" @@ -680,10 +681,13 @@ The '``@llvm.nvvm.cp.async.bulk.shared.cta.to.global``' intrinsic corresponds to the ``cp.async.bulk.global.shared::cta.*`` set of PTX instructions. These instructions initiate an asynchronous copy from shared::cta to global memory. The 32-bit operand ``%size`` specifies -the amount of memory to be copied and it must be a multiple of 16. +the amount of memory to be copied (in bytes) and it must be a multiple +of 16. For the ``.bytemask`` variant, the 16-bit wide mask operand +specifies whether the i-th byte of each 16-byte wide chunk of source +data is copied to the destination. -* The last argument to these intrinsics is a boolean flag - indicating support for cache_hint. This flag argument must +* The ``i1 %flag_ch`` argument to these intrinsics is a boolean + flag indicating support for cache_hint. This flag argument must be a compile-time constant. When set, it indicates a valid cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint`` variant of the PTX instruction. diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index a95c739f1331..67c47095076d 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -2112,6 +2112,19 @@ def int_nvvm_cp_async_bulk_shared_cta_to_global WriteOnly>, ReadOnly>, NoCapture>, NoCapture>]>; +// From Shared CTA to Global memory with bytemask +def int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask + : DefaultAttrsIntrinsic<[], + [llvm_global_ptr_ty, // dst_gmem_ptr + llvm_shared_ptr_ty, // src_smem_ptr + llvm_i32_ty, // copy_size + llvm_i64_ty, // cache_hint + llvm_i1_ty, // Flag for cache_hint + llvm_i16_ty], // byte_mask + [IntrConvergent, IntrArgMemOnly, + WriteOnly>, ReadOnly>, + ImmArg>]>; + // Intrinsics for Bulk Copy Prefetch L2 def int_nvvm_cp_async_bulk_prefetch_L2 : DefaultAttrsIntrinsicFlags<[], diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 7d171cff7bcb..2247ae3cf8f4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -2685,31 +2685,6 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N, ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); } -void NVPTXDAGToDAGISel::SelectCpAsyncBulkS2G(SDNode *N) { - // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: - // dst, src, size, cache_hint, cache_hint_flag - // NumOperands = {Chain, IID} + {Actual intrinsic args} - // = {2} + {5} - size_t NumOps = N->getNumOperands(); - bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1; - size_t NumArgs = IsCacheHint ? 4 : 3; // src, dst, size, cache_hint - - SDLoc DL(N); - SmallVector Ops(N->ops().slice(2, NumArgs)); - Ops.push_back(N->getOperand(0)); // Chain operand - - bool IsShared32 = - CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32; - unsigned Opcode; - if (IsCacheHint) - Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32_CH - : NVPTX::CP_ASYNC_BULK_S2G_CH; - else - Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32 - : NVPTX::CP_ASYNC_BULK_S2G; - ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops)); -} - void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) { // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: // {dst, mbar, src, size, multicast, cache_hint, @@ -2892,9 +2867,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) { case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster: SelectCpAsyncBulkG2S(N); return true; - case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global: - SelectCpAsyncBulkS2G(N); - return true; case Intrinsic::nvvm_cp_async_bulk_prefetch_L2: SelectCpAsyncBulkPrefetchL2(N); return true; diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 23cbd458571a..92efabc7e206 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -93,7 +93,6 @@ private: void SelectV2I64toI128(SDNode *N); void SelectI128toV2I64(SDNode *N); void SelectCpAsyncBulkG2S(SDNode *N); - void SelectCpAsyncBulkS2G(SDNode *N); void SelectCpAsyncBulkPrefetchL2(SDNode *N); void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false); void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false); diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index d3cfce76c666..4f8a798295b4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -511,10 +511,11 @@ def CP_ASYNC_BULK_WAIT_GROUP_READ : // TMA Async Bulk Copy Functions //------------------------------ -class CpAsyncBulkStr { +class CpAsyncBulkStr { // Shared to Global memory string S2G = "cp.async.bulk.global.shared::cta.bulk_group" - # !if(ch, ".L2::cache_hint", ""); + # !if(ch, ".L2::cache_hint", "") + # !if(mask, ".cp_mask", ""); // Global to Shared cluster memory string G2S = "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes" @@ -525,18 +526,23 @@ class CpAsyncBulkStr { string C2C = "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes"; } -multiclass CP_ASYNC_BULK_S2G { - def NAME: NVPTXInst<(outs), - (ins Int64Regs:$dst, rc:$src, Int32Regs:$size), - !strconcat(CpAsyncBulkStr<0, 0>.S2G, " [$dst], [$src], $size;"), []>, - Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _CH: NVPTXInst<(outs), - (ins Int64Regs:$dst, rc:$src, Int32Regs:$size, Int64Regs:$ch), - !strconcat(CpAsyncBulkStr<0, 1>.S2G, " [$dst], [$src], $size, $ch;"), []>, - Requires<[hasPTX<80>, hasSM<90>]>; +multiclass CP_ASYNC_BULK_S2G_INTR { + def NAME : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch), + !if(has_ch, + CpAsyncBulkStr<0, 1>.S2G # " [$dst], [$src], $size, $ch;", + CpAsyncBulkStr<0, 0>.S2G # " [$dst], [$src], $size;"), + [(int_nvvm_cp_async_bulk_shared_cta_to_global addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>, + Requires<[hasPTX<80>, hasSM<90>]>; + + def NAME # _BM : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch, Int16Regs:$mask), + !if(has_ch, + CpAsyncBulkStr<0, 1, 1>.S2G # " [$dst], [$src], $size, $ch, $mask;", + CpAsyncBulkStr<0, 0, 1>.S2G # " [$dst], [$src], $size, $mask;"), + [(int_nvvm_cp_async_bulk_shared_cta_to_global_bytemask addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0), i16:$mask)]>, + Requires<[hasPTX<86>, hasSM<100>]>; } -defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G; -defm CP_ASYNC_BULK_S2G_SHARED32 : CP_ASYNC_BULK_S2G; +defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G_INTR<0>; +defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR<1>; multiclass CP_ASYNC_BULK_G2S { def NAME: NVPTXInst<(outs), diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll new file mode 100644 index 000000000000..1e6b04635edd --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-s2g-sm100.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86| %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100 %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1), ptr addrspace(3), i32, i64, i1, i16) + +define void @cp_async_bulk_s2g_bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i16 %mask) { +; CHECK-PTX64-LABEL: cp_async_bulk_s2g_bytemask( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<4>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_bytemask_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_bytemask_param_1]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_bytemask_param_2]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_s2g_bytemask_param_3]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_s2g_bytemask_param_4]; +; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.cp_mask [%rd1], [%rd2], %r1, %rd3, %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%rd1], [%rd2], %r1, %rs1; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_s2g_bytemask( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_bytemask_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_bytemask_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_s2g_bytemask_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_bytemask_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_s2g_bytemask_param_4]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint.cp_mask [%rd1], [%r1], %r2, %rd2, %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.cp_mask [%rd1], [%r1], %r2, %rs1; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 1, i16 %mask) + tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global.bytemask(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 0, i16 %mask) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll index 77694ac82459..d7f2a5df5547 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll @@ -66,8 +66,8 @@ define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32 ; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_0]; ; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_1]; ; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_2]; -; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1; ; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_s2g_param_3]; +; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1; ; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd2], [%rd1], %r1, %rd3; ; CHECK-PTX64-NEXT: ret; ; @@ -80,11 +80,11 @@ define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32 ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_0]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_1]; ; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_s2g_param_2]; -; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2; ; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_3]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd1], [%r1], %r2, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; - tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 0, i1 0) + tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 0) tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 1) ret void }