[NVPTX] Add elect.sync Intrinsic (#104780)

This patch adds an NVVM intrinsic and NVPTX codegen for the elect.sync
PTX instruction. Lit tests are
added in elect.ll and verified through ptxas.

PTX ISA reference:

https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
This commit is contained in:
Durgadoss R
2024-08-20 20:46:05 +05:30
committed by GitHub
parent 55d744eea3
commit d5e9691509
4 changed files with 109 additions and 1 deletions

View File

@@ -251,10 +251,36 @@ Overview:
The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0``
instruction, equivalent to the ``__syncthreads()`` call in CUDA.
Electing a thread
-----------------
'``llvm.nvvm.elect.sync``'
^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
.. code-block:: llvm
declare {i32, i1} @llvm.nvvm.elect.sync(i32 %membermask)
Overview:
"""""""""
The '``@llvm.nvvm.elect.sync``' intrinsic generates the ``elect.sync``
PTX instruction, which elects one predicated active leader thread from
a set of threads specified by ``membermask``. The behavior is undefined
if the executing thread is not in ``membermask``. The laneid of the
elected thread is captured in the i32 return value. The i1 return
value is set to ``True`` for the leader thread and ``False`` for all
the other threads. Election of a leader thread happens deterministically,
i.e. the same leader thread is elected for the same ``membermask``
every time. For more information, refer to the PTX ISA
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync>`_.
Membar/Fences
-------------
'``llvm.nvvm.fence.proxy.tensormap_generic.*``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

View File

@@ -4706,6 +4706,14 @@ def int_nvvm_match_all_sync_i64p :
Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty],
[IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.match.all.sync.i64p">;
//
// ELECT.SYNC
//
// elect.sync dst|pred, membermask
//
// Takes one i32 operand (membermask) and returns two values, an i32 and an
// i1, corresponding to the dst|pred pair of the PTX form shown above.
// The intrinsic is marked IntrConvergent and IntrInaccessibleMemOnly
// (NOTE(review): presumably so that repeated calls are neither merged nor
// dropped and calls are not moved across control flow -- confirm).
def int_nvvm_elect_sync :
DefaultAttrsIntrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty],
[IntrInaccessibleMemOnly, IntrConvergent]>;
//
// REDUX.SYNC
//

View File

@@ -243,6 +243,16 @@ defm VOTE_SYNC_ANY : VOTE_SYNC<Int1Regs, "any.pred", int_nvvm_vote_any_sync>;
defm VOTE_SYNC_UNI : VOTE_SYNC<Int1Regs, "uni.pred", int_nvvm_vote_uni_sync>;
defm VOTE_SYNC_BALLOT : VOTE_SYNC<Int32Regs, "ballot.b32", int_nvvm_vote_ballot_sync>;
// elect.sync
//
// Selection patterns for int_nvvm_elect_sync. Both records produce two
// results: a 32-bit register ($dest) and a predicate register ($pred),
// printed as "$dest|$pred". They differ only in how the mask operand is
// supplied, and both are gated on the hasPTX<80> and hasSM<90> predicates.

// Immediate form: the mask is a constant, emitted directly as an i32 immediate.
def INT_ELECT_SYNC_I : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins i32imm:$mask),
"elect.sync \t$dest|$pred, $mask;",
[(set Int32Regs:$dest, Int1Regs:$pred, (int_nvvm_elect_sync imm:$mask))]>,
Requires<[hasPTX<80>, hasSM<90>]>;
// Register form: the mask is read from a 32-bit register.
def INT_ELECT_SYNC_R : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins Int32Regs:$mask),
"elect.sync \t$dest|$pred, $mask;",
[(set Int32Regs:$dest, Int1Regs:$pred, (int_nvvm_elect_sync Int32Regs:$mask))]>,
Requires<[hasPTX<80>, hasSM<90>]>;
multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp,
Operand ImmOp> {
def ii : NVPTXInst<(outs Int32Regs:$dest), (ins i32imm:$mask, ImmOp:$value),

View File

@@ -0,0 +1,64 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s
; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
target triple = "nvptx64-nvidia-cuda"
declare {i32, i1} @llvm.nvvm.elect.sync(i32)
; Register-mask form: the membermask comes from a function parameter, so
; it is loaded into a register and that register is passed to elect.sync.
define {i32, i1} @elect_sync(i32 %mask) {
; CHECK-LABEL: elect_sync(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<3>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [elect_sync_param_0];
; CHECK-NEXT: elect.sync %r2|%p1, %r1;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1;
; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1;
; CHECK-NEXT: ret;
%val = call {i32, i1} @llvm.nvvm.elect.sync(i32 %mask)
ret {i32, i1} %val
}
; Immediate-mask form: a constant all-ones membermask (u0xffffffff) is
; expected to be emitted directly as the immediate operand -1, with no
; register holding the mask.
define {i32, i1} @elect_sync_imm() {
; CHECK-LABEL: elect_sync_imm(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<2>;
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<2>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: elect.sync %r1|%p1, -1;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1;
; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1;
; CHECK-NEXT: ret;
%val = call {i32, i1} @llvm.nvvm.elect.sync(i32 u0xffffffff)
ret {i32, i1} %val
}
; When there are two elect.sync calls with the same mask, make sure that
; the second one is not optimized away, even though its result (%val2)
; is never used and only the first result is returned.
define {i32, i1} @elect_sync_twice(i32 %mask) {
; CHECK-LABEL: elect_sync_twice(
; CHECK: {
; CHECK-NEXT: .reg .pred %p<3>;
; CHECK-NEXT: .reg .b16 %rs<2>;
; CHECK-NEXT: .reg .b32 %r<4>;
; CHECK-EMPTY:
; CHECK-NEXT: // %bb.0:
; CHECK-NEXT: ld.param.u32 %r1, [elect_sync_twice_param_0];
; CHECK-NEXT: elect.sync %r2|%p1, %r1;
; CHECK-NEXT: elect.sync %r3|%p2, %r1;
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1;
; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1;
; CHECK-NEXT: ret;
%val = call {i32, i1} @llvm.nvvm.elect.sync(i32 %mask)
%val2 = call {i32, i1} @llvm.nvvm.elect.sync(i32 %mask)
ret {i32, i1} %val
}