From d5e9691509aa774d1acedc4d3fa2d497757a2d6d Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Tue, 20 Aug 2024 20:46:05 +0530 Subject: [PATCH] [NVPTX] Add elect.sync Intrinsic (#104780) This patch adds an NVVM intrinsic and NVPTX codegen for the elect.sync PTX instruction. Lit tests are added in elect.ll and verified through ptxas. PTX ISA reference: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync Signed-off-by: Durgadoss R --- llvm/docs/NVPTXUsage.rst | 28 ++++++++++- llvm/include/llvm/IR/IntrinsicsNVVM.td | 8 +++ llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 10 ++++ llvm/test/CodeGen/NVPTX/elect.ll | 64 ++++++++++++++++++++++++ 4 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/NVPTX/elect.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 872dedf8a82d..3a566bbac362 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -251,10 +251,36 @@ Overview: The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0`` instruction, equivalent to the ``__syncthreads()`` call in CUDA. +Electing a thread +----------------- + +'``llvm.nvvm.elect.sync``' +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare {i32, i1} @llvm.nvvm.elect.sync(i32 %membermask) + +Overview: +""""""""" + +The '``@llvm.nvvm.elect.sync``' intrinsic generates the ``elect.sync`` +PTX instruction, which elects one predicated active leader thread from +a set of threads specified by ``membermask``. The behavior is undefined +if the executing thread is not in ``membermask``. The laneid of the +elected thread is captured in the i32 return value. The i1 return +value is set to ``True`` for the leader thread and ``False`` for all +the other threads. Election of a leader thread happens deterministically, +i.e. the same leader thread is elected for the same ``membermask`` +every time. 
For more information, refer to the PTX ISA +`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync>`_. + Membar/Fences ------------- - '``llvm.nvvm.fence.proxy.tensormap_generic.*``' ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 8a333e7a559f..39685c920d94 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -4706,6 +4706,14 @@ def int_nvvm_match_all_sync_i64p : Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty], [IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.match.all.sync.i64p">; +// +// ELECT.SYNC +// +// elect.sync dst|pred, membermask +def int_nvvm_elect_sync : + DefaultAttrsIntrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty], + [IntrInaccessibleMemOnly, IntrConvergent]>; + // // REDUX.SYNC // diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index a9116e15c367..371b4c213533 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -243,6 +243,16 @@ defm VOTE_SYNC_ANY : VOTE_SYNC; defm VOTE_SYNC_UNI : VOTE_SYNC; defm VOTE_SYNC_BALLOT : VOTE_SYNC; +// elect.sync +def INT_ELECT_SYNC_I : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins i32imm:$mask), + "elect.sync \t$dest|$pred, $mask;", + [(set Int32Regs:$dest, Int1Regs:$pred, (int_nvvm_elect_sync imm:$mask))]>, + Requires<[hasPTX<80>, hasSM<90>]>; +def INT_ELECT_SYNC_R : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins Int32Regs:$mask), + "elect.sync \t$dest|$pred, $mask;", + [(set Int32Regs:$dest, Int1Regs:$pred, (int_nvvm_elect_sync Int32Regs:$mask))]>, + Requires<[hasPTX<80>, hasSM<90>]>; + multiclass MATCH_ANY_SYNC { def ii : NVPTXInst<(outs Int32Regs:$dest), (ins i32imm:$mask, ImmOp:$value), diff --git a/llvm/test/CodeGen/NVPTX/elect.ll b/llvm/test/CodeGen/NVPTX/elect.ll new file mode 100644 index 000000000000..358dfef91852 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/elect.ll @@ -0,0
+1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s +; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %} + +target triple = "nvptx64-nvidia-cuda" + +declare {i32, i1} @llvm.nvvm.elect.sync(i32) + +define {i32, i1} @elect_sync(i32 %mask) { +; CHECK-LABEL: elect_sync( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [elect_sync_param_0]; +; CHECK-NEXT: elect.sync %r2|%p1, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1; +; CHECK-NEXT: ret; + %val = call {i32, i1} @llvm.nvvm.elect.sync(i32 %mask) + ret {i32, i1} %val +} + +define {i32, i1} @elect_sync_imm() { +; CHECK-LABEL: elect_sync_imm( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<2>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: elect.sync %r1|%p1, -1; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1; +; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1; +; CHECK-NEXT: ret; + %val = call {i32, i1} @llvm.nvvm.elect.sync(i32 u0xffffffff) + ret {i32, i1} %val +} + +; When there are two elect.sync's make sure that +; the second one is not optimized away. 
+define {i32, i1} @elect_sync_twice(i32 %mask) { +; CHECK-LABEL: elect_sync_twice( +; CHECK: { +; CHECK-NEXT: .reg .pred %p<3>; +; CHECK-NEXT: .reg .b16 %rs<2>; +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [elect_sync_twice_param_0]; +; CHECK-NEXT: elect.sync %r2|%p1, %r1; +; CHECK-NEXT: elect.sync %r3|%p2, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2; +; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1; +; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1; +; CHECK-NEXT: ret; + %val = call {i32, i1} @llvm.nvvm.elect.sync(i32 %mask) + %val2 = call {i32, i1} @llvm.nvvm.elect.sync(i32 %mask) + ret {i32, i1} %val +}