[NVPTX] Add elect.sync Intrinsic (#104780)
This patch adds an NVVM intrinsic and NVPTX codegen for the elect.sync PTX instruction. Lit tests are added in elect.ll and verified through ptxas. PTX ISA reference: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
This commit is contained in:
@@ -251,10 +251,36 @@ Overview:
|
||||
The '``@llvm.nvvm.barrier0()``' intrinsic emits a PTX ``bar.sync 0``
|
||||
instruction, equivalent to the ``__syncthreads()`` call in CUDA.
|
||||
|
||||
Electing a thread
|
||||
-----------------
|
||||
|
||||
'``llvm.nvvm.elect.sync``'
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
Syntax:
|
||||
"""""""
|
||||
|
||||
.. code-block:: llvm
|
||||
|
||||
declare {i32, i1} @llvm.nvvm.elect.sync(i32 %membermask)
|
||||
|
||||
Overview:
|
||||
"""""""""
|
||||
|
||||
The '``@llvm.nvvm.elect.sync``' intrinsic generates the ``elect.sync``
|
||||
PTX instruction, which elects one predicated active leader thread from
|
||||
a set of threads specified by ``membermask``. The behavior is undefined
|
||||
if the executing thread is not in ``membermask``. The laneid of the
|
||||
elected thread is captured in the i32 return value. The i1 return
|
||||
value is set to ``True`` for the leader thread and ``False`` for all
|
||||
the other threads. Election of a leader thread happens deterministically,
|
||||
i.e. the same leader thread is elected for the same ``membermask``
|
||||
every time. For more information, refer to the PTX ISA
|
||||
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync>`_.
|
||||
|
||||
Membar/Fences
|
||||
-------------
|
||||
|
||||
|
||||
'``llvm.nvvm.fence.proxy.tensormap_generic.*``'
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
|
||||
@@ -4706,6 +4706,14 @@ def int_nvvm_match_all_sync_i64p :
|
||||
Intrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty, llvm_i64_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent, IntrNoCallback], "llvm.nvvm.match.all.sync.i64p">;
|
||||
|
||||
//
|
||||
// ELECT.SYNC
|
||||
//
|
||||
// elect.sync dst|pred, membermask
|
||||
// Produces two results (i32, i1) from a single i32 membermask operand.
// Marked convergent and as accessing only inaccessible memory.
def int_nvvm_elect_sync :
|
||||
DefaultAttrsIntrinsic<[llvm_i32_ty, llvm_i1_ty], [llvm_i32_ty],
|
||||
[IntrInaccessibleMemOnly, IntrConvergent]>;
|
||||
|
||||
//
|
||||
// REDUX.SYNC
|
||||
//
|
||||
|
||||
@@ -243,6 +243,16 @@ defm VOTE_SYNC_ANY : VOTE_SYNC<Int1Regs, "any.pred", int_nvvm_vote_any_sync>;
|
||||
defm VOTE_SYNC_UNI : VOTE_SYNC<Int1Regs, "uni.pred", int_nvvm_vote_uni_sync>;
|
||||
defm VOTE_SYNC_BALLOT : VOTE_SYNC<Int32Regs, "ballot.b32", int_nvvm_vote_ballot_sync>;
|
||||
|
||||
// elect.sync
|
||||
// Immediate-mask form: matches int_nvvm_elect_sync when the mask operand is
// an immediate, and writes both an Int32Regs result ($dest) and an Int1Regs
// result ($pred). Gated on hasPTX<80> and hasSM<90>.
def INT_ELECT_SYNC_I : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins i32imm:$mask),
|
||||
"elect.sync \t$dest|$pred, $mask;",
|
||||
[(set Int32Regs:$dest, Int1Regs:$pred, (int_nvvm_elect_sync imm:$mask))]>,
|
||||
Requires<[hasPTX<80>, hasSM<90>]>;
|
||||
// Register-mask form: matches int_nvvm_elect_sync when the mask operand is
// held in an Int32Regs register. Emits the same asm string and carries the
// same hasPTX<80> / hasSM<90> requirements as the immediate form.
def INT_ELECT_SYNC_R : NVPTXInst<(outs Int32Regs:$dest, Int1Regs:$pred), (ins Int32Regs:$mask),
|
||||
"elect.sync \t$dest|$pred, $mask;",
|
||||
[(set Int32Regs:$dest, Int1Regs:$pred, (int_nvvm_elect_sync Int32Regs:$mask))]>,
|
||||
Requires<[hasPTX<80>, hasSM<90>]>;
|
||||
|
||||
multiclass MATCH_ANY_SYNC<NVPTXRegClass regclass, string ptxtype, Intrinsic IntOp,
|
||||
Operand ImmOp> {
|
||||
def ii : NVPTXInst<(outs Int32Regs:$dest), (ins i32imm:$mask, ImmOp:$value),
|
||||
|
||||
64
llvm/test/CodeGen/NVPTX/elect.ll
Normal file
64
llvm/test/CodeGen/NVPTX/elect.ll
Normal file
@@ -0,0 +1,64 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
||||
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s
|
||||
; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
|
||||
|
||||
target triple = "nvptx64-nvidia-cuda"
|
||||
|
||||
declare {i32, i1} @llvm.nvvm.elect.sync(i32)
|
||||
|
||||
; Register-mask case: the i32 argument is loaded from the parameter and
; passed to elect.sync as a register operand; both results are returned.
define {i32, i1} @elect_sync(i32 %mask) {
|
||||
; CHECK-LABEL: elect_sync(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .pred %p<2>;
|
||||
; CHECK-NEXT: .reg .b16 %rs<2>;
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [elect_sync_param_0];
|
||||
; CHECK-NEXT: elect.sync %r2|%p1, %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
|
||||
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1;
|
||||
; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1;
|
||||
; CHECK-NEXT: ret;
|
||||
%val = call {i32, i1} @llvm.nvvm.elect.sync(i32 %mask)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
; Immediate-mask case: the constant mask u0xffffffff is expected to be
; emitted directly as the immediate operand -1, with no register load.
define {i32, i1} @elect_sync_imm() {
|
||||
; CHECK-LABEL: elect_sync_imm(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .pred %p<2>;
|
||||
; CHECK-NEXT: .reg .b16 %rs<2>;
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: elect.sync %r1|%p1, -1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r1;
|
||||
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1;
|
||||
; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1;
|
||||
; CHECK-NEXT: ret;
|
||||
%val = call {i32, i1} @llvm.nvvm.elect.sync(i32 u0xffffffff)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
|
||||
; When there are two elect.sync's make sure that
|
||||
; the second one is not optimized away.
|
||||
; Only %val is returned; %val2 is unused, yet the CHECK lines still expect
; two elect.sync instructions in the output.
define {i32, i1} @elect_sync_twice(i32 %mask) {
|
||||
; CHECK-LABEL: elect_sync_twice(
|
||||
; CHECK: {
|
||||
; CHECK-NEXT: .reg .pred %p<3>;
|
||||
; CHECK-NEXT: .reg .b16 %rs<2>;
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [elect_sync_twice_param_0];
|
||||
; CHECK-NEXT: elect.sync %r2|%p1, %r1;
|
||||
; CHECK-NEXT: elect.sync %r3|%p2, %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0+0], %r2;
|
||||
; CHECK-NEXT: selp.u16 %rs1, -1, 0, %p1;
|
||||
; CHECK-NEXT: st.param.b8 [func_retval0+4], %rs1;
|
||||
; CHECK-NEXT: ret;
|
||||
%val = call {i32, i1} @llvm.nvvm.elect.sync(i32 %mask)
|
||||
%val2 = call {i32, i1} @llvm.nvvm.elect.sync(i32 %mask)
|
||||
ret {i32, i1} %val
|
||||
}
|
||||
Reference in New Issue
Block a user