This patch adds an intrinsic for the `setmaxnreg` PTX instruction.

* PTX documentation for this instruction: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#miscellaneous-instructions-setmaxnreg
* The i32 argument, an immediate value, specifies the absolute register count for the instruction.
* The `setmaxnreg` instruction is available on SM90a, so this patch adds a `hasSM90a` predicate for use in the NVPTX backend.
* lit tests are added to verify the lowering of the intrinsic.
* Verifier logic (and tests) are added to check the register-count range and divisibility-by-8 requirements.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
17 lines
651 B
LLVM
17 lines
651 B
LLVM
; Checks that the llvm.nvvm.setmaxnreg.{inc,dec} intrinsics lower to the
; corresponding PTX setmaxnreg instructions when targeting sm_90a with PTX 8.0.
; The second RUN line additionally feeds the generated PTX to ptxas, but only
; when the ptxas-12.0 lit feature is available.
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80 | FileCheck --check-prefixes=CHECK %s
; RUN: %if ptxas-12.0 %{ llc < %s -march=nvptx64 -mcpu=sm_90a -mattr=+ptx80 | %ptxas-verify -arch=sm_90a %}

; Intrinsics under test. The single i32 operand is the register count and is
; passed as an immediate at every call site in this file.
declare void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 %reg_count)
declare void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 %reg_count)

; Calls the increment and decrement variants once each with a constant register
; count and expects the matching PTX instruction to carry the same immediate.
; CHECK-LABEL: test_set_maxn_reg
define void @test_set_maxn_reg() {
  ; Increment variant: the immediate 96 must appear unchanged in the PTX.
  ; CHECK: setmaxnreg.inc.sync.aligned.u32 96;
  call void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 96)

  ; Decrement variant: the immediate 64 must appear unchanged in the PTX.
  ; CHECK: setmaxnreg.dec.sync.aligned.u32 64;
  call void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 64)

  ret void
}