Allow directly storing an immediate instead of requiring that it first be moved into a register. This makes for more compact and readable PTX. An approach similar to this (using a ComplexPattern) could be used for most PTX instructions to avoid the need for `_[ri]+` variants and boilerplate.
197 lines
6.4 KiB
LLVM
197 lines
6.4 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX32
|
|
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s --check-prefix=PTX64
|
|
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
|
|
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 -verify-machineinstrs | %ptxas-verify %}
|
|
|
|
; Ensure we access the local stack properly
|
|
|
|
; Store the i32 argument into a 4-byte-aligned stack slot using a volatile
; store. The slot's address is used only by that store, and the checks below
; expect it to be computed from %SPL and written with st.local.
define void @foo(i32 %a) {
; PTX32-LABEL: foo(
; PTX32:       {
; PTX32-NEXT:    .local .align 4 .b8 __local_depot0[4];
; PTX32-NEXT:    .reg .b32 %SP;
; PTX32-NEXT:    .reg .b32 %SPL;
; PTX32-NEXT:    .reg .b32 %r<4>;
; PTX32-EMPTY:
; PTX32-NEXT:  // %bb.0:
; PTX32-NEXT:    mov.b32 %SPL, __local_depot0;
; PTX32-NEXT:    ld.param.b32 %r1, [foo_param_0];
; PTX32-NEXT:    add.u32 %r3, %SPL, 0;
; PTX32-NEXT:    st.local.b32 [%r3], %r1;
; PTX32-NEXT:    ret;
;
; PTX64-LABEL: foo(
; PTX64:       {
; PTX64-NEXT:    .local .align 4 .b8 __local_depot0[4];
; PTX64-NEXT:    .reg .b64 %SP;
; PTX64-NEXT:    .reg .b64 %SPL;
; PTX64-NEXT:    .reg .b32 %r<2>;
; PTX64-NEXT:    .reg .b64 %rd<3>;
; PTX64-EMPTY:
; PTX64-NEXT:  // %bb.0:
; PTX64-NEXT:    mov.b64 %SPL, __local_depot0;
; PTX64-NEXT:    ld.param.b32 %r1, [foo_param_0];
; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
; PTX64-NEXT:    st.local.b32 [%rd2], %r1;
; PTX64-NEXT:    ret;
  %local = alloca i32, align 4
  store volatile i32 %a, ptr %local
  ret void
}
|
|
|
|
; Kernel entry point that stores its i32 argument into a stack slot and then
; passes the slot's address to @bar. The checks below expect two addresses for
; the same slot: one derived from %SP (after cvta.local) that is handed to the
; call, and one derived from %SPL that is used by the st.local store.
define ptx_kernel void @foo2(i32 %a) {
; PTX32-LABEL: foo2(
; PTX32:       {
; PTX32-NEXT:    .local .align 4 .b8 __local_depot1[4];
; PTX32-NEXT:    .reg .b32 %SP;
; PTX32-NEXT:    .reg .b32 %SPL;
; PTX32-NEXT:    .reg .b32 %r<4>;
; PTX32-EMPTY:
; PTX32-NEXT:  // %bb.0:
; PTX32-NEXT:    mov.b32 %SPL, __local_depot1;
; PTX32-NEXT:    cvta.local.u32 %SP, %SPL;
; PTX32-NEXT:    ld.param.b32 %r1, [foo2_param_0];
; PTX32-NEXT:    add.u32 %r2, %SP, 0;
; PTX32-NEXT:    add.u32 %r3, %SPL, 0;
; PTX32-NEXT:    st.local.b32 [%r3], %r1;
; PTX32-NEXT:    { // callseq 0, 0
; PTX32-NEXT:    .param .b32 param0;
; PTX32-NEXT:    st.param.b32 [param0], %r2;
; PTX32-NEXT:    call.uni bar, (param0);
; PTX32-NEXT:    } // callseq 0
; PTX32-NEXT:    ret;
;
; PTX64-LABEL: foo2(
; PTX64:       {
; PTX64-NEXT:    .local .align 4 .b8 __local_depot1[4];
; PTX64-NEXT:    .reg .b64 %SP;
; PTX64-NEXT:    .reg .b64 %SPL;
; PTX64-NEXT:    .reg .b32 %r<2>;
; PTX64-NEXT:    .reg .b64 %rd<3>;
; PTX64-EMPTY:
; PTX64-NEXT:  // %bb.0:
; PTX64-NEXT:    mov.b64 %SPL, __local_depot1;
; PTX64-NEXT:    cvta.local.u64 %SP, %SPL;
; PTX64-NEXT:    ld.param.b32 %r1, [foo2_param_0];
; PTX64-NEXT:    add.u64 %rd1, %SP, 0;
; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
; PTX64-NEXT:    st.local.b32 [%rd2], %r1;
; PTX64-NEXT:    { // callseq 0, 0
; PTX64-NEXT:    .param .b64 param0;
; PTX64-NEXT:    st.param.b64 [param0], %rd1;
; PTX64-NEXT:    call.uni bar, (param0);
; PTX64-NEXT:    } // callseq 0
; PTX64-NEXT:    ret;
  %local = alloca i32, align 4
  store i32 %a, ptr %local
  call void @bar(ptr %local)
  ret void
}
|
|
|
|
; Declared here without a body; @foo2 and @foo4 call it with pointers to their
; stack slots.
declare void @bar(ptr %a)
|
|
|
|
; Index a 3-element i32 stack array by the argument and store the argument at
; that element. The checks below expect the base address to come from %SPL,
; the index to be scaled by 4 (shl by 2 in the 32-bit run, mul.wide.s32 by 4 in
; the 64-bit run), and the store to be emitted as st.local.
define void @foo3(i32 %a) {
; PTX32-LABEL: foo3(
; PTX32:       {
; PTX32-NEXT:    .local .align 4 .b8 __local_depot2[12];
; PTX32-NEXT:    .reg .b32 %SP;
; PTX32-NEXT:    .reg .b32 %SPL;
; PTX32-NEXT:    .reg .b32 %r<6>;
; PTX32-EMPTY:
; PTX32-NEXT:  // %bb.0:
; PTX32-NEXT:    mov.b32 %SPL, __local_depot2;
; PTX32-NEXT:    ld.param.b32 %r1, [foo3_param_0];
; PTX32-NEXT:    add.u32 %r3, %SPL, 0;
; PTX32-NEXT:    shl.b32 %r4, %r1, 2;
; PTX32-NEXT:    add.s32 %r5, %r3, %r4;
; PTX32-NEXT:    st.local.b32 [%r5], %r1;
; PTX32-NEXT:    ret;
;
; PTX64-LABEL: foo3(
; PTX64:       {
; PTX64-NEXT:    .local .align 4 .b8 __local_depot2[12];
; PTX64-NEXT:    .reg .b64 %SP;
; PTX64-NEXT:    .reg .b64 %SPL;
; PTX64-NEXT:    .reg .b32 %r<2>;
; PTX64-NEXT:    .reg .b64 %rd<5>;
; PTX64-EMPTY:
; PTX64-NEXT:  // %bb.0:
; PTX64-NEXT:    mov.b64 %SPL, __local_depot2;
; PTX64-NEXT:    ld.param.b32 %r1, [foo3_param_0];
; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
; PTX64-NEXT:    mul.wide.s32 %rd3, %r1, 4;
; PTX64-NEXT:    add.s64 %rd4, %rd2, %rd3;
; PTX64-NEXT:    st.local.b32 [%rd4], %r1;
; PTX64-NEXT:    ret;
  %local = alloca [3 x i32], align 4
  %1 = getelementptr inbounds i32, ptr %local, i32 %a
  store i32 %a, ptr %1
  ret void
}
|
|
|
|
; Zero two separate i32 stack slots and pass each slot's address to @bar in its
; own call. The checks below expect the constant 0 to appear directly as the
; source operand of each st.local (no register holding the constant), with the
; stores addressed via %SPL and the call arguments derived from %SP.
define void @foo4() {
; PTX32-LABEL: foo4(
; PTX32:       {
; PTX32-NEXT:    .local .align 4 .b8 __local_depot3[8];
; PTX32-NEXT:    .reg .b32 %SP;
; PTX32-NEXT:    .reg .b32 %SPL;
; PTX32-NEXT:    .reg .b32 %r<5>;
; PTX32-EMPTY:
; PTX32-NEXT:  // %bb.0:
; PTX32-NEXT:    mov.b32 %SPL, __local_depot3;
; PTX32-NEXT:    cvta.local.u32 %SP, %SPL;
; PTX32-NEXT:    add.u32 %r1, %SP, 0;
; PTX32-NEXT:    add.u32 %r2, %SPL, 0;
; PTX32-NEXT:    add.u32 %r3, %SP, 4;
; PTX32-NEXT:    add.u32 %r4, %SPL, 4;
; PTX32-NEXT:    st.local.b32 [%r2], 0;
; PTX32-NEXT:    st.local.b32 [%r4], 0;
; PTX32-NEXT:    { // callseq 1, 0
; PTX32-NEXT:    .param .b32 param0;
; PTX32-NEXT:    st.param.b32 [param0], %r1;
; PTX32-NEXT:    call.uni bar, (param0);
; PTX32-NEXT:    } // callseq 1
; PTX32-NEXT:    { // callseq 2, 0
; PTX32-NEXT:    .param .b32 param0;
; PTX32-NEXT:    st.param.b32 [param0], %r3;
; PTX32-NEXT:    call.uni bar, (param0);
; PTX32-NEXT:    } // callseq 2
; PTX32-NEXT:    ret;
;
; PTX64-LABEL: foo4(
; PTX64:       {
; PTX64-NEXT:    .local .align 4 .b8 __local_depot3[8];
; PTX64-NEXT:    .reg .b64 %SP;
; PTX64-NEXT:    .reg .b64 %SPL;
; PTX64-NEXT:    .reg .b64 %rd<5>;
; PTX64-EMPTY:
; PTX64-NEXT:  // %bb.0:
; PTX64-NEXT:    mov.b64 %SPL, __local_depot3;
; PTX64-NEXT:    cvta.local.u64 %SP, %SPL;
; PTX64-NEXT:    add.u64 %rd1, %SP, 0;
; PTX64-NEXT:    add.u64 %rd2, %SPL, 0;
; PTX64-NEXT:    add.u64 %rd3, %SP, 4;
; PTX64-NEXT:    add.u64 %rd4, %SPL, 4;
; PTX64-NEXT:    st.local.b32 [%rd2], 0;
; PTX64-NEXT:    st.local.b32 [%rd4], 0;
; PTX64-NEXT:    { // callseq 1, 0
; PTX64-NEXT:    .param .b64 param0;
; PTX64-NEXT:    st.param.b64 [param0], %rd1;
; PTX64-NEXT:    call.uni bar, (param0);
; PTX64-NEXT:    } // callseq 1
; PTX64-NEXT:    { // callseq 2, 0
; PTX64-NEXT:    .param .b64 param0;
; PTX64-NEXT:    st.param.b64 [param0], %rd3;
; PTX64-NEXT:    call.uni bar, (param0);
; PTX64-NEXT:    } // callseq 2
; PTX64-NEXT:    ret;
  %A = alloca i32
  %B = alloca i32
  store i32 0, ptr %A
  store i32 0, ptr %B
  call void @bar(ptr %A)
  call void @bar(ptr %B)
  ret void
}
|