This change consolidates and cleans up various NVPTXISD target-specific nodes in order to simplify SDAG ISel. While there are some whitespace changes in the emitted PTX, it is otherwise a non-functional change. NVPTXISD::Wrapper - This node was used to wrap external-symbol and global-address nodes. It is redundant and has been removed. Instead, we use the non-target versions of these nodes and convert them appropriately during ISel. NVPTXISD::CALL - Most of the family of nodes used to represent a PTX call instruction has been replaced by this new single node. It corresponds to a single instruction and is therefore much simpler to create and lower.
595 lines
26 KiB
LLVM
595 lines
26 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
|
|
; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT
|
|
; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX
|
|
|
|
; Struct of four i32 fields. It is the byval type of the first parameter of
; @non_kernel_function and the value type of the global @gi.
%struct.uint4 = type { i32, i32, i32, i32 }
|
|
|
|
; Global in addrspace(1); @non_kernel_function addrspacecasts its address to a
; generic pointer and uses it as the alternative to its byval argument.
@gi = dso_local addrspace(1) externally_initialized global %struct.uint4 { i32 50462976, i32 117835012, i32 185207048, i32 252579084 }, align 16
|
|
|
|
; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none)
|
|
; Regular functions must still make a copy. `cvta.param` does not always work there.
|
|
; Plain (non-ptx_kernel) function taking a byval %struct.uint4. It selects
; between the byval pointer %a and the generic address of @gi based on %b, then
; loads an align-1 i32 at byte offset %c from the selected pointer.
; The OPT checks expect the body to come out of -nvptx-lower-args unchanged.
; The PTX checks expect cvta.local applied to the parameter symbol and the
; unaligned i32 assembled from four ld.b8 loads combined with shl/or.
; NOTE(review): the !dbg attachments below reference !17, !18, !19 and !23,
; which in this file are nodes belonging to the grid_constant annotations
; rather than debug locations -- confirm this is intentional.
define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly byval(%struct.uint4) align 16 %a, i1 noundef zeroext %b, i32 noundef %c) local_unnamed_addr #0 {
|
|
; OPT-LABEL: define dso_local noundef i32 @non_kernel_function(
|
|
; OPT-SAME: ptr noundef readonly byval([[STRUCT_UINT4:%.*]]) align 16 captures(none) [[A:%.*]], i1 noundef zeroext [[B:%.*]], i32 noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
|
|
; OPT-NEXT: [[ENTRY:.*:]]
|
|
; OPT-NEXT: [[A_:%.*]] = select i1 [[B]], ptr [[A]], ptr addrspacecast (ptr addrspace(1) @gi to ptr)
|
|
; OPT-NEXT: [[IDX_EXT:%.*]] = sext i32 [[C]] to i64
|
|
; OPT-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[A_]], i64 [[IDX_EXT]]
|
|
; OPT-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADD_PTR]], align 1
|
|
; OPT-NEXT: ret i32 [[TMP0]]
|
|
;
|
|
; PTX-LABEL: non_kernel_function(
|
|
; PTX: {
|
|
; PTX-NEXT: .reg .pred %p<2>;
|
|
; PTX-NEXT: .reg .b16 %rs<3>;
|
|
; PTX-NEXT: .reg .b32 %r<11>;
|
|
; PTX-NEXT: .reg .b64 %rd<8>;
|
|
; PTX-EMPTY:
|
|
; PTX-NEXT: // %bb.0: // %entry
|
|
; PTX-NEXT: mov.b64 %rd1, non_kernel_function_param_0;
|
|
; PTX-NEXT: cvta.local.u64 %rd2, %rd1;
|
|
; PTX-NEXT: ld.param.b8 %rs1, [non_kernel_function_param_1];
|
|
; PTX-NEXT: and.b16 %rs2, %rs1, 1;
|
|
; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0;
|
|
; PTX-NEXT: mov.b64 %rd3, gi;
|
|
; PTX-NEXT: cvta.global.u64 %rd4, %rd3;
|
|
; PTX-NEXT: selp.b64 %rd5, %rd2, %rd4, %p1;
|
|
; PTX-NEXT: ld.param.s32 %rd6, [non_kernel_function_param_2];
|
|
; PTX-NEXT: add.s64 %rd7, %rd5, %rd6;
|
|
; PTX-NEXT: ld.b8 %r1, [%rd7];
|
|
; PTX-NEXT: ld.b8 %r2, [%rd7+1];
|
|
; PTX-NEXT: shl.b32 %r3, %r2, 8;
|
|
; PTX-NEXT: or.b32 %r4, %r3, %r1;
|
|
; PTX-NEXT: ld.b8 %r5, [%rd7+2];
|
|
; PTX-NEXT: shl.b32 %r6, %r5, 16;
|
|
; PTX-NEXT: ld.b8 %r7, [%rd7+3];
|
|
; PTX-NEXT: shl.b32 %r8, %r7, 24;
|
|
; PTX-NEXT: or.b32 %r9, %r8, %r6;
|
|
; PTX-NEXT: or.b32 %r10, %r9, %r4;
|
|
; PTX-NEXT: st.param.b32 [func_retval0], %r10;
|
|
; PTX-NEXT: ret;
|
|
entry:
|
|
%a. = select i1 %b, ptr %a, ptr addrspacecast (ptr addrspace(1) @gi to ptr), !dbg !17
|
|
%idx.ext = sext i32 %c to i64, !dbg !18
|
|
%add.ptr = getelementptr inbounds i8, ptr %a., i64 %idx.ext, !dbg !18
|
|
%0 = load i32, ptr %add.ptr, align 1, !dbg !19
|
|
ret i32 %0, !dbg !23
|
|
}
|
|
|
|
; Kernel whose byval(i32) parameter %input1 is only read: the loaded value is
; added to %input2 and the sum is stored to %out. %n is unused.
; The OPT checks expect the load to go through the addrspace(101) pointer
; returned by @llvm.nvvm.internal.addrspace.wrap. The PTX checks expect a
; direct ld.param.b32 from grid_const_int_param_0.
define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) {
|
|
; PTX-LABEL: grid_const_int(
|
|
; PTX: {
|
|
; PTX-NEXT: .reg .b32 %r<4>;
|
|
; PTX-NEXT: .reg .b64 %rd<3>;
|
|
; PTX-EMPTY:
|
|
; PTX-NEXT: // %bb.0:
|
|
; PTX-NEXT: ld.param.b64 %rd1, [grid_const_int_param_2];
|
|
; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
|
|
; PTX-NEXT: ld.param.b32 %r1, [grid_const_int_param_1];
|
|
; PTX-NEXT: ld.param.b32 %r2, [grid_const_int_param_0];
|
|
; PTX-NEXT: add.s32 %r3, %r2, %r1;
|
|
; PTX-NEXT: st.global.b32 [%rd2], %r3;
|
|
; PTX-NEXT: ret;
|
|
; OPT-LABEL: define ptx_kernel void @grid_const_int(
|
|
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
|
|
; OPT-NEXT: [[INPUT11:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
|
|
; OPT-NEXT: [[TMP:%.*]] = load i32, ptr addrspace(101) [[INPUT11]], align 4
|
|
; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[INPUT2]]
|
|
; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4
|
|
; OPT-NEXT: ret void
|
|
%tmp = load i32, ptr %input1, align 4
|
|
%add = add i32 %tmp, %input2
|
|
store i32 %add, ptr %out
|
|
ret void
|
|
}
|
|
|
|
; Two-field i32 struct used as the byval parameter type by the kernels below.
%struct.s = type { i32, i32 }
|
|
|
|
; Kernel that reads both fields of its byval %struct.s parameter through GEPs,
; adds them and stores the sum to %out.
; The OPT checks expect both GEPs and both loads to be rewritten onto the
; addrspace(101) pointer. The PTX checks expect ld.param.b32 loads at offsets
; 0 and 4 of grid_const_struct_param_0.
define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, ptr %out){
|
|
; PTX-LABEL: grid_const_struct(
|
|
; PTX: {
|
|
; PTX-NEXT: .reg .b32 %r<4>;
|
|
; PTX-NEXT: .reg .b64 %rd<3>;
|
|
; PTX-EMPTY:
|
|
; PTX-NEXT: // %bb.0:
|
|
; PTX-NEXT: ld.param.b64 %rd1, [grid_const_struct_param_1];
|
|
; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1;
|
|
; PTX-NEXT: ld.param.b32 %r1, [grid_const_struct_param_0];
|
|
; PTX-NEXT: ld.param.b32 %r2, [grid_const_struct_param_0+4];
|
|
; PTX-NEXT: add.s32 %r3, %r1, %r2;
|
|
; PTX-NEXT: st.global.b32 [%rd2], %r3;
|
|
; PTX-NEXT: ret;
|
|
; OPT-LABEL: define ptx_kernel void @grid_const_struct(
|
|
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] {
|
|
; OPT-NEXT: [[INPUT1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
|
|
; OPT-NEXT: [[GEP13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 0
|
|
; OPT-NEXT: [[GEP22:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr addrspace(101) [[INPUT1]], i32 0, i32 1
|
|
; OPT-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(101) [[GEP13]], align 4
|
|
; OPT-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(101) [[GEP22]], align 4
|
|
; OPT-NEXT: [[ADD:%.*]] = add i32 [[TMP1]], [[TMP2]]
|
|
; OPT-NEXT: store i32 [[ADD]], ptr [[OUT]], align 4
|
|
; OPT-NEXT: ret void
|
|
%gep1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
|
|
%gep2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
|
|
%int1 = load i32, ptr %gep1
|
|
%int2 = load i32, ptr %gep2
|
|
%add = add i32 %int1, %int2
|
|
store i32 %add, ptr %out
|
|
ret void
|
|
}
|
|
|
|
; Kernel whose byval pointer escapes by being passed straight to @escape.
; The OPT checks expect the argument to be wrapped into addrspace(101), cast
; back to a generic pointer, and that generic pointer passed to the call. The
; PTX checks expect cvta.param on the parameter symbol before the call.
; NOTE(review): the call site uses an i32 return type while @escape is declared
; returning ptr; the PTX checks show the call going through a register with a
; .callprototype -- confirm the mismatch is intended.
define ptx_kernel void @grid_const_escape(ptr byval(%struct.s) align 4 %input) {
|
|
; PTX-LABEL: grid_const_escape(
|
|
; PTX: {
|
|
; PTX-NEXT: .reg .b32 %r<3>;
|
|
; PTX-NEXT: .reg .b64 %rd<4>;
|
|
; PTX-EMPTY:
|
|
; PTX-NEXT: // %bb.0:
|
|
; PTX-NEXT: mov.b64 %rd2, grid_const_escape_param_0;
|
|
; PTX-NEXT: cvta.param.u64 %rd3, %rd2;
|
|
; PTX-NEXT: mov.b64 %rd1, escape;
|
|
; PTX-NEXT: { // callseq 0, 0
|
|
; PTX-NEXT: .param .b64 param0;
|
|
; PTX-NEXT: st.param.b64 [param0], %rd3;
|
|
; PTX-NEXT: .param .b32 retval0;
|
|
; PTX-NEXT: prototype_0 : .callprototype (.param .b32 _) _ (.param .b64 _);
|
|
; PTX-NEXT: call (retval0), %rd1, (param0), prototype_0;
|
|
; PTX-NEXT: ld.param.b32 %r1, [retval0];
|
|
; PTX-NEXT: } // callseq 0
|
|
; PTX-NEXT: ret;
|
|
; OPT-LABEL: define ptx_kernel void @grid_const_escape(
|
|
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
|
|
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
|
|
; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
|
|
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT_PARAM_GEN]])
|
|
; OPT-NEXT: ret void
|
|
%call = call i32 @escape(ptr %input)
|
|
ret void
|
|
}
|
|
|
|
; Kernel with two byval parameters (%input, %b) and a scalar %a. %a is stored
; to an alloca and all three pointers are passed to @escape3.
; The OPT checks expect %b and %input each to be wrapped into addrspace(101)
; and cast to a generic pointer, with the alloca for %a left in place. The PTX
; checks expect a 4-byte local depot (for the %a slot), cvta.param on both
; byval parameter symbols, and the three pointers passed as param0..param2.
define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) {
|
|
; PTX-LABEL: multiple_grid_const_escape(
|
|
; PTX: {
|
|
; PTX-NEXT: .local .align 4 .b8 __local_depot4[4];
|
|
; PTX-NEXT: .reg .b64 %SP;
|
|
; PTX-NEXT: .reg .b64 %SPL;
|
|
; PTX-NEXT: .reg .b32 %r<4>;
|
|
; PTX-NEXT: .reg .b64 %rd<8>;
|
|
; PTX-EMPTY:
|
|
; PTX-NEXT: // %bb.0:
|
|
; PTX-NEXT: mov.b64 %SPL, __local_depot4;
|
|
; PTX-NEXT: cvta.local.u64 %SP, %SPL;
|
|
; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_0;
|
|
; PTX-NEXT: ld.param.b32 %r1, [multiple_grid_const_escape_param_1];
|
|
; PTX-NEXT: mov.b64 %rd3, multiple_grid_const_escape_param_2;
|
|
; PTX-NEXT: cvta.param.u64 %rd4, %rd3;
|
|
; PTX-NEXT: cvta.param.u64 %rd5, %rd2;
|
|
; PTX-NEXT: add.u64 %rd6, %SP, 0;
|
|
; PTX-NEXT: add.u64 %rd7, %SPL, 0;
|
|
; PTX-NEXT: st.local.b32 [%rd7], %r1;
|
|
; PTX-NEXT: mov.b64 %rd1, escape3;
|
|
; PTX-NEXT: { // callseq 1, 0
|
|
; PTX-NEXT: .param .b64 param0;
|
|
; PTX-NEXT: st.param.b64 [param0], %rd5;
|
|
; PTX-NEXT: .param .b64 param1;
|
|
; PTX-NEXT: st.param.b64 [param1], %rd6;
|
|
; PTX-NEXT: .param .b64 param2;
|
|
; PTX-NEXT: st.param.b64 [param2], %rd4;
|
|
; PTX-NEXT: .param .b32 retval0;
|
|
; PTX-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .b64 _, .param .b64 _, .param .b64 _);
|
|
; PTX-NEXT: call (retval0), %rd1, (param0, param1, param2), prototype_1;
|
|
; PTX-NEXT: ld.param.b32 %r2, [retval0];
|
|
; PTX-NEXT: } // callseq 1
|
|
; PTX-NEXT: ret;
|
|
; OPT-LABEL: define ptx_kernel void @multiple_grid_const_escape(
|
|
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], i32 [[A:%.*]], ptr byval(i32) align 4 [[B:%.*]]) #[[ATTR0]] {
|
|
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[B]])
|
|
; OPT-NEXT: [[B_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
|
|
; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
|
|
; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
|
|
; OPT-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
|
|
; OPT-NEXT: store i32 [[A]], ptr [[A_ADDR]], align 4
|
|
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape3(ptr [[INPUT_PARAM_GEN]], ptr [[A_ADDR]], ptr [[B_PARAM_GEN]])
|
|
; OPT-NEXT: ret void
|
|
%a.addr = alloca i32, align 4
|
|
store i32 %a, ptr %a.addr, align 4
|
|
%call = call i32 @escape3(ptr %input, ptr %a.addr, ptr %b)
|
|
ret void
|
|
}
|
|
|
|
; Kernel whose byval pointer escapes by being stored to memory at %addr.
; The OPT checks expect the stored value to be the generic pointer obtained by
; addrspacecasting the addrspace(101) wrapper. The PTX checks expect
; cvta.param on the parameter symbol followed by st.global.b64 of the result.
define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %input, ptr %addr) {
|
|
; PTX-LABEL: grid_const_memory_escape(
|
|
; PTX: {
|
|
; PTX-NEXT: .reg .b64 %rd<5>;
|
|
; PTX-EMPTY:
|
|
; PTX-NEXT: // %bb.0:
|
|
; PTX-NEXT: mov.b64 %rd1, grid_const_memory_escape_param_0;
|
|
; PTX-NEXT: ld.param.b64 %rd2, [grid_const_memory_escape_param_1];
|
|
; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
|
|
; PTX-NEXT: cvta.param.u64 %rd4, %rd1;
|
|
; PTX-NEXT: st.global.b64 [%rd3], %rd4;
|
|
; PTX-NEXT: ret;
|
|
; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape(
|
|
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] {
|
|
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
|
|
; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
|
|
; OPT-NEXT: store ptr [[INPUT1]], ptr [[ADDR]], align 8
|
|
; OPT-NEXT: ret void
|
|
store ptr %input, ptr %addr, align 8
|
|
ret void
|
|
}
|
|
|
|
; Kernel whose byval pointer escapes into inline asm: pointers to both fields
; of %input are passed as "l" operands to an add.s64 asm statement and the
; result is stored to %result.
; The OPT checks expect both GEPs to be based on the generic pointer obtained
; by addrspacecasting the addrspace(101) wrapper. The PTX checks expect
; cvta.param on the parameter symbol feeding the asm operands.
; The negative check on `.local` after `ret;` is written by hand; it previously
; lacked the colon after the directive name, so FileCheck treated it as an
; ordinary comment and never evaluated it.
define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 %input, ptr %result) {
|
|
; PTX-LABEL: grid_const_inlineasm_escape(
|
|
; PTX: {
|
|
; PTX-NEXT: .reg .b64 %rd<7>;
|
|
; PTX-EMPTY:
|
|
; PTX-NEXT: // %bb.0:
|
|
; PTX-NEXT: mov.b64 %rd4, grid_const_inlineasm_escape_param_0;
|
|
; PTX-NEXT: ld.param.b64 %rd5, [grid_const_inlineasm_escape_param_1];
|
|
; PTX-NEXT: cvta.to.global.u64 %rd6, %rd5;
|
|
; PTX-NEXT: cvta.param.u64 %rd2, %rd4;
|
|
; PTX-NEXT: add.s64 %rd3, %rd2, 4;
|
|
; PTX-NEXT: // begin inline asm
|
|
; PTX-NEXT: add.s64 %rd1, %rd2, %rd3;
|
|
; PTX-NEXT: // end inline asm
|
|
; PTX-NEXT: st.global.b64 [%rd6], %rd1;
|
|
; PTX-NEXT: ret;
|
|
; PTX-NOT: .local
|
|
; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape(
|
|
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[RESULT:%.*]]) #[[ATTR0]] {
|
|
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
|
|
; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
|
|
; OPT-NEXT: [[TMPPTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
|
|
; OPT-NEXT: [[TMPPTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
|
|
; OPT-NEXT: [[TMP2:%.*]] = call i64 asm "add.s64 $0, $1, $2
|
|
; OPT-NEXT: store i64 [[TMP2]], ptr [[RESULT]], align 8
|
|
; OPT-NEXT: ret void
|
|
%tmpptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
|
|
%tmpptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
|
|
%1 = call i64 asm "add.s64 $0, $1, $2;", "=l,l,l"(ptr %tmpptr1, ptr %tmpptr2) #1
|
|
store i64 %1, ptr %result, align 8
|
|
ret void
|
|
}
|
|
|
|
; Kernel that both reads its byval(i32) parameter (doubling the value and
; storing it to %output) and lets the pointer escape via a call to @escape.
; The OPT checks expect the load and the call to use the same generic pointer
; produced by addrspacecasting the addrspace(101) wrapper. The PTX checks
; expect the value to be read with ld.param.b32 from the parameter symbol and
; the cvta.param result to be passed to the call.
define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %output) {
|
|
; PTX-LABEL: grid_const_partial_escape(
|
|
; PTX: {
|
|
; PTX-NEXT: .reg .b32 %r<5>;
|
|
; PTX-NEXT: .reg .b64 %rd<6>;
|
|
; PTX-EMPTY:
|
|
; PTX-NEXT: // %bb.0:
|
|
; PTX-NEXT: mov.b64 %rd2, grid_const_partial_escape_param_0;
|
|
; PTX-NEXT: ld.param.b64 %rd3, [grid_const_partial_escape_param_1];
|
|
; PTX-NEXT: cvta.to.global.u64 %rd4, %rd3;
|
|
; PTX-NEXT: cvta.param.u64 %rd5, %rd2;
|
|
; PTX-NEXT: ld.param.b32 %r1, [grid_const_partial_escape_param_0];
|
|
; PTX-NEXT: add.s32 %r2, %r1, %r1;
|
|
; PTX-NEXT: st.global.b32 [%rd4], %r2;
|
|
; PTX-NEXT: mov.b64 %rd1, escape;
|
|
; PTX-NEXT: { // callseq 2, 0
|
|
; PTX-NEXT: .param .b64 param0;
|
|
; PTX-NEXT: st.param.b64 [param0], %rd5;
|
|
; PTX-NEXT: .param .b32 retval0;
|
|
; PTX-NEXT: prototype_2 : .callprototype (.param .b32 _) _ (.param .b64 _);
|
|
; PTX-NEXT: call (retval0), %rd1, (param0), prototype_2;
|
|
; PTX-NEXT: ld.param.b32 %r3, [retval0];
|
|
; PTX-NEXT: } // callseq 2
|
|
; PTX-NEXT: ret;
|
|
; OPT-LABEL: define ptx_kernel void @grid_const_partial_escape(
|
|
; OPT-SAME: ptr byval(i32) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
|
|
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
|
|
; OPT-NEXT: [[INPUT1_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
|
|
; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[INPUT1_GEN]], align 4
|
|
; OPT-NEXT: [[TWICE:%.*]] = add i32 [[VAL1]], [[VAL1]]
|
|
; OPT-NEXT: store i32 [[TWICE]], ptr [[OUTPUT]], align 4
|
|
; OPT-NEXT: [[CALL:%.*]] = call i32 @escape(ptr [[INPUT1_GEN]])
|
|
; OPT-NEXT: ret void
|
|
%val = load i32, ptr %input
|
|
%twice = add i32 %val, %val
|
|
store i32 %twice, ptr %output
|
|
%call = call i32 @escape(ptr %input)
|
|
ret void
|
|
}
|
|
|
|
; Kernel that reads both fields of its byval %struct.s parameter, stores the
; parameter pointer itself to %output, passes the first-field pointer to
; @escape, and returns the sum of the two loaded fields.
; The OPT checks expect every use to be based on the generic pointer produced
; by addrspacecasting the addrspace(101) wrapper. The PTX checks expect the
; field reads as ld.param.b32 at offsets 0 and 4, and the same cvta.param
; result to be both stored and passed to the call.
define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ptr %output) {
|
|
; PTX-LABEL: grid_const_partial_escapemem(
|
|
; PTX: {
|
|
; PTX-NEXT: .reg .b32 %r<6>;
|
|
; PTX-NEXT: .reg .b64 %rd<6>;
|
|
; PTX-EMPTY:
|
|
; PTX-NEXT: // %bb.0:
|
|
; PTX-NEXT: mov.b64 %rd2, grid_const_partial_escapemem_param_0;
|
|
; PTX-NEXT: ld.param.b64 %rd3, [grid_const_partial_escapemem_param_1];
|
|
; PTX-NEXT: cvta.to.global.u64 %rd4, %rd3;
|
|
; PTX-NEXT: cvta.param.u64 %rd5, %rd2;
|
|
; PTX-NEXT: ld.param.b32 %r1, [grid_const_partial_escapemem_param_0];
|
|
; PTX-NEXT: ld.param.b32 %r2, [grid_const_partial_escapemem_param_0+4];
|
|
; PTX-NEXT: st.global.b64 [%rd4], %rd5;
|
|
; PTX-NEXT: add.s32 %r3, %r1, %r2;
|
|
; PTX-NEXT: mov.b64 %rd1, escape;
|
|
; PTX-NEXT: { // callseq 3, 0
|
|
; PTX-NEXT: .param .b64 param0;
|
|
; PTX-NEXT: st.param.b64 [param0], %rd5;
|
|
; PTX-NEXT: .param .b32 retval0;
|
|
; PTX-NEXT: prototype_3 : .callprototype (.param .b32 _) _ (.param .b64 _);
|
|
; PTX-NEXT: call (retval0), %rd1, (param0), prototype_3;
|
|
; PTX-NEXT: ld.param.b32 %r4, [retval0];
|
|
; PTX-NEXT: } // callseq 3
|
|
; PTX-NEXT: st.param.b32 [func_retval0], %r3;
|
|
; PTX-NEXT: ret;
|
|
; OPT-LABEL: define ptx_kernel i32 @grid_const_partial_escapemem(
|
|
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) [[INPUT:%.*]], ptr [[OUTPUT:%.*]]) #[[ATTR0]] {
|
|
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
|
|
; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
|
|
; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 0
|
|
; OPT-NEXT: [[VAL1:%.*]] = load i32, ptr [[PTR1]], align 4
|
|
; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1]], i32 0, i32 1
|
|
; OPT-NEXT: [[VAL2:%.*]] = load i32, ptr [[PTR2]], align 4
|
|
; OPT-NEXT: store ptr [[INPUT1]], ptr [[OUTPUT]], align 8
|
|
; OPT-NEXT: [[ADD:%.*]] = add i32 [[VAL1]], [[VAL2]]
|
|
; OPT-NEXT: [[CALL2:%.*]] = call i32 @escape(ptr [[PTR1]])
|
|
; OPT-NEXT: ret i32 [[ADD]]
|
|
%ptr1 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 0
|
|
%val1 = load i32, ptr %ptr1
|
|
%ptr2 = getelementptr inbounds %struct.s, ptr %input, i32 0, i32 1
|
|
%val2 = load i32, ptr %ptr2
|
|
store ptr %input, ptr %output
|
|
%add = add i32 %val1, %val2
|
|
%call2 = call i32 @escape(ptr %ptr1)
|
|
ret i32 %add
|
|
}
|
|
|
|
; Kernel that picks a field of its byval %struct.s parameter through a phi:
; field 0 when the value loaded from %inout is negative, field 1 otherwise. The
; selected field is loaded and written back to %inout.
; The OPT checks expect both GEPs to be based on the generic pointer produced
; by addrspacecasting the addrspace(101) wrapper. The PTX checks expect the
; parameter symbol to be kept in a register, conditionally advanced by 4, and
; read with ld.param.b32.
define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) {
|
|
; PTX-LABEL: grid_const_phi(
|
|
; PTX: {
|
|
; PTX-NEXT: .reg .pred %p<2>;
|
|
; PTX-NEXT: .reg .b32 %r<3>;
|
|
; PTX-NEXT: .reg .b64 %rd<7>;
|
|
; PTX-EMPTY:
|
|
; PTX-NEXT: // %bb.0:
|
|
; PTX-NEXT: mov.b64 %rd6, grid_const_phi_param_0;
|
|
; PTX-NEXT: ld.param.b64 %rd5, [grid_const_phi_param_1];
|
|
; PTX-NEXT: cvta.to.global.u64 %rd1, %rd5;
|
|
; PTX-NEXT: ld.global.b32 %r1, [%rd1];
|
|
; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
|
|
; PTX-NEXT: @%p1 bra $L__BB9_2;
|
|
; PTX-NEXT: // %bb.1: // %second
|
|
; PTX-NEXT: add.s64 %rd6, %rd6, 4;
|
|
; PTX-NEXT: $L__BB9_2: // %merge
|
|
; PTX-NEXT: ld.param.b32 %r2, [%rd6];
|
|
; PTX-NEXT: st.global.b32 [%rd1], %r2;
|
|
; PTX-NEXT: ret;
|
|
; OPT-LABEL: define ptx_kernel void @grid_const_phi(
|
|
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
|
|
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
|
|
; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
|
|
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
|
|
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
|
|
; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
|
|
; OPT: [[FIRST]]:
|
|
; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
|
|
; OPT-NEXT: br label %[[MERGE:.*]]
|
|
; OPT: [[SECOND]]:
|
|
; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 1
|
|
; OPT-NEXT: br label %[[MERGE]]
|
|
; OPT: [[MERGE]]:
|
|
; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
|
|
; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
|
|
; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4
|
|
; OPT-NEXT: ret void
|
|
|
|
|
|
%val = load i32, ptr %inout
|
|
%less = icmp slt i32 %val, 0
|
|
br i1 %less, label %first, label %second
|
|
first:
|
|
%ptr1 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 0
|
|
br label %merge
|
|
second:
|
|
%ptr2 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 1
|
|
br label %merge
|
|
merge:
|
|
%ptrnew = phi ptr [%ptr1, %first], [%ptr2, %second]
|
|
%valloaded = load i32, ptr %ptrnew
|
|
store i32 %valloaded, ptr %inout
|
|
ret void
|
|
}
|
|
|
|
; NOTE: %input2 is *not* grid_constant
|
|
; Variant of @grid_const_phi where the two phi inputs come from different
; byval parameters: field 0 of %input1 when the value loaded from %inout is
; negative, field 1 of %input2 otherwise.
; The OPT checks expect both parameters to be wrapped into addrspace(101) and
; cast to generic pointers (%input2 first). The PTX checks expect the phi to
; carry a parameter-symbol address (param_0, or param_1 plus 4) that is read
; with ld.param.b32.
define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval(%struct.s) %input2, ptr %inout) {
|
|
; PTX-LABEL: grid_const_phi_ngc(
|
|
; PTX: {
|
|
; PTX-NEXT: .reg .pred %p<2>;
|
|
; PTX-NEXT: .reg .b32 %r<3>;
|
|
; PTX-NEXT: .reg .b64 %rd<8>;
|
|
; PTX-EMPTY:
|
|
; PTX-NEXT: // %bb.0:
|
|
; PTX-NEXT: mov.b64 %rd7, grid_const_phi_ngc_param_0;
|
|
; PTX-NEXT: ld.param.b64 %rd6, [grid_const_phi_ngc_param_2];
|
|
; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6;
|
|
; PTX-NEXT: ld.global.b32 %r1, [%rd1];
|
|
; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
|
|
; PTX-NEXT: @%p1 bra $L__BB10_2;
|
|
; PTX-NEXT: // %bb.1: // %second
|
|
; PTX-NEXT: mov.b64 %rd2, grid_const_phi_ngc_param_1;
|
|
; PTX-NEXT: add.s64 %rd7, %rd2, 4;
|
|
; PTX-NEXT: $L__BB10_2: // %merge
|
|
; PTX-NEXT: ld.param.b32 %r2, [%rd7];
|
|
; PTX-NEXT: st.global.b32 [%rd1], %r2;
|
|
; PTX-NEXT: ret;
|
|
; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc(
|
|
; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
|
|
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
|
|
; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
|
|
; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
|
|
; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
|
|
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
|
|
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
|
|
; OPT-NEXT: br i1 [[LESS]], label %[[FIRST:.*]], label %[[SECOND:.*]]
|
|
; OPT: [[FIRST]]:
|
|
; OPT-NEXT: [[PTR1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT1_PARAM_GEN]], i32 0, i32 0
|
|
; OPT-NEXT: br label %[[MERGE:.*]]
|
|
; OPT: [[SECOND]]:
|
|
; OPT-NEXT: [[PTR2:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[INPUT2_PARAM_GEN]], i32 0, i32 1
|
|
; OPT-NEXT: br label %[[MERGE]]
|
|
; OPT: [[MERGE]]:
|
|
; OPT-NEXT: [[PTRNEW:%.*]] = phi ptr [ [[PTR1]], %[[FIRST]] ], [ [[PTR2]], %[[SECOND]] ]
|
|
; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
|
|
; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4
|
|
; OPT-NEXT: ret void
|
|
%val = load i32, ptr %inout
|
|
%less = icmp slt i32 %val, 0
|
|
br i1 %less, label %first, label %second
|
|
first:
|
|
%ptr1 = getelementptr inbounds %struct.s, ptr %input1, i32 0, i32 0
|
|
br label %merge
|
|
second:
|
|
%ptr2 = getelementptr inbounds %struct.s, ptr %input2, i32 0, i32 1
|
|
br label %merge
|
|
merge:
|
|
%ptrnew = phi ptr [%ptr1, %first], [%ptr2, %second]
|
|
%valloaded = load i32, ptr %ptrnew
|
|
store i32 %valloaded, ptr %inout
|
|
ret void
|
|
}
|
|
|
|
; NOTE: %input2 is *not* grid_constant
|
|
; Kernel that chooses between two byval(i32) parameters with a select (%input1
; when the value loaded from %inout is negative, %input2 otherwise), loads
; through the chosen pointer and writes the value back to %inout.
; The OPT checks expect both parameters to be wrapped into addrspace(101) and
; cast to generic pointers that feed the select. The PTX checks expect selp
; between the two parameter-symbol addresses followed by ld.param.b32.
define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr byval(i32) %input2, ptr %inout) {
|
|
; PTX-LABEL: grid_const_select(
|
|
; PTX: {
|
|
; PTX-NEXT: .reg .pred %p<2>;
|
|
; PTX-NEXT: .reg .b32 %r<3>;
|
|
; PTX-NEXT: .reg .b64 %rd<6>;
|
|
; PTX-EMPTY:
|
|
; PTX-NEXT: // %bb.0:
|
|
; PTX-NEXT: mov.b64 %rd1, grid_const_select_param_0;
|
|
; PTX-NEXT: ld.param.b64 %rd2, [grid_const_select_param_2];
|
|
; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2;
|
|
; PTX-NEXT: mov.b64 %rd4, grid_const_select_param_1;
|
|
; PTX-NEXT: ld.global.b32 %r1, [%rd3];
|
|
; PTX-NEXT: setp.lt.s32 %p1, %r1, 0;
|
|
; PTX-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1;
|
|
; PTX-NEXT: ld.param.b32 %r2, [%rd5];
|
|
; PTX-NEXT: st.global.b32 [%rd3], %r2;
|
|
; PTX-NEXT: ret;
|
|
; OPT-LABEL: define ptx_kernel void @grid_const_select(
|
|
; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] {
|
|
; OPT-NEXT: [[TMP1:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT2]])
|
|
; OPT-NEXT: [[INPUT2_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP1]] to ptr
|
|
; OPT-NEXT: [[TMP2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT1]])
|
|
; OPT-NEXT: [[INPUT1_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[TMP2]] to ptr
|
|
; OPT-NEXT: [[VAL:%.*]] = load i32, ptr [[INOUT]], align 4
|
|
; OPT-NEXT: [[LESS:%.*]] = icmp slt i32 [[VAL]], 0
|
|
; OPT-NEXT: [[PTRNEW:%.*]] = select i1 [[LESS]], ptr [[INPUT1_PARAM_GEN]], ptr [[INPUT2_PARAM_GEN]]
|
|
; OPT-NEXT: [[VALLOADED:%.*]] = load i32, ptr [[PTRNEW]], align 4
|
|
; OPT-NEXT: store i32 [[VALLOADED]], ptr [[INOUT]], align 4
|
|
; OPT-NEXT: ret void
|
|
%val = load i32, ptr %inout
|
|
%less = icmp slt i32 %val, 0
|
|
%ptrnew = select i1 %less, ptr %input1, ptr %input2
|
|
%valloaded = load i32, ptr %ptrnew
|
|
store i32 %valloaded, ptr %inout
|
|
ret void
|
|
}
|
|
|
|
; Kernel that both loads from its byval(i32) parameter and converts the
; parameter pointer to an i32 with ptrtoint, returning the sum of the two.
; The OPT checks expect the load to use the addrspace(101) pointer directly,
; while the ptrtoint operates on a generic pointer obtained by addrspacecast;
; they also expect `align 4` on the parameter, which the input omits. The PTX
; checks expect ld.param.b32 for the value and cvta.param followed by
; cvt.u32.u64 for the pointer.
define ptx_kernel i32 @grid_const_ptrtoint(ptr byval(i32) %input) {
|
|
; PTX-LABEL: grid_const_ptrtoint(
|
|
; PTX: {
|
|
; PTX-NEXT: .reg .b32 %r<4>;
|
|
; PTX-NEXT: .reg .b64 %rd<3>;
|
|
; PTX-EMPTY:
|
|
; PTX-NEXT: // %bb.0:
|
|
; PTX-NEXT: mov.b64 %rd1, grid_const_ptrtoint_param_0;
|
|
; PTX-NEXT: ld.param.b32 %r1, [grid_const_ptrtoint_param_0];
|
|
; PTX-NEXT: cvta.param.u64 %rd2, %rd1;
|
|
; PTX-NEXT: cvt.u32.u64 %r2, %rd2;
|
|
; PTX-NEXT: add.s32 %r3, %r1, %r2;
|
|
; PTX-NEXT: st.param.b32 [func_retval0], %r3;
|
|
; PTX-NEXT: ret;
|
|
; OPT-LABEL: define ptx_kernel i32 @grid_const_ptrtoint(
|
|
; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
|
|
; OPT-NEXT: [[INPUT2:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
|
|
; OPT-NEXT: [[INPUT3:%.*]] = load i32, ptr addrspace(101) [[INPUT2]], align 4
|
|
; OPT-NEXT: [[INPUT1:%.*]] = addrspacecast ptr addrspace(101) [[INPUT2]] to ptr
|
|
; OPT-NEXT: [[PTRVAL:%.*]] = ptrtoint ptr [[INPUT1]] to i32
|
|
; OPT-NEXT: [[KEEPALIVE:%.*]] = add i32 [[INPUT3]], [[PTRVAL]]
|
|
; OPT-NEXT: ret i32 [[KEEPALIVE]]
|
|
%val = load i32, ptr %input
|
|
%ptrval = ptrtoint ptr %input to i32
|
|
%keepalive = add i32 %val, %ptrval
|
|
ret i32 %keepalive
|
|
}
|
|
|
|
; External callee taking a byval(i32) argument; called only by
; @test_forward_byval_arg in this file.
declare void @device_func(ptr byval(i32) align 4)
|
|
|
|
; Kernel that forwards its byval(i32) parameter as the byval argument of a
; direct call to @device_func.
; The OPT checks expect the parameter to be wrapped into addrspace(101), cast
; to a generic pointer, and that pointer passed as the byval operand. The PTX
; checks expect the value to be read with ld.param.b32 and copied into a
; 4-byte `.param` slot for a call.uni to device_func.
define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) {
|
|
; OPT-LABEL: define ptx_kernel void @test_forward_byval_arg(
|
|
; OPT-SAME: ptr byval(i32) align 4 [[INPUT:%.*]]) #[[ATTR0]] {
|
|
; OPT-NEXT: [[INPUT_PARAM:%.*]] = call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr [[INPUT]])
|
|
; OPT-NEXT: [[INPUT_PARAM_GEN:%.*]] = addrspacecast ptr addrspace(101) [[INPUT_PARAM]] to ptr
|
|
; OPT-NEXT: call void @device_func(ptr byval(i32) align 4 [[INPUT_PARAM_GEN]])
|
|
; OPT-NEXT: ret void
|
|
;
|
|
; PTX-LABEL: test_forward_byval_arg(
|
|
; PTX: {
|
|
; PTX-NEXT: .reg .b32 %r<2>;
|
|
; PTX-EMPTY:
|
|
; PTX-NEXT: // %bb.0:
|
|
; PTX-NEXT: ld.param.b32 %r1, [test_forward_byval_arg_param_0];
|
|
; PTX-NEXT: { // callseq 4, 0
|
|
; PTX-NEXT: .param .align 4 .b8 param0[4];
|
|
; PTX-NEXT: st.param.b32 [param0], %r1;
|
|
; PTX-NEXT: call.uni device_func, (param0);
|
|
; PTX-NEXT: } // callseq 4
|
|
; PTX-NEXT: ret;
|
|
call void @device_func(ptr byval(i32) align 4 %input)
|
|
ret void
|
|
}
|
|
|
|
|
|
; External declarations used as opaque escape points by the kernels above.
; @dummy is not referenced by any function visible in this file.
declare dso_local void @dummy() local_unnamed_addr
|
|
; NOTE(review): @escape and @escape3 are declared returning ptr, but every call
; site in this file uses an i32 return type -- confirm this is intended.
declare dso_local ptr @escape(ptr) local_unnamed_addr
|
|
declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr
|
|
|
|
; NVVM annotations marking kernel parameters as grid_constant. Each
; even-numbered node pairs a kernel with the key "grid_constant" and the
; odd-numbered node that follows it, which lists parameter indices.
; The indices appear to be 1-based: the in-file note that %input2 (the second
; parameter) of @grid_const_phi_ngc and @grid_const_select is not
; grid_constant matches their index lists containing only 1.
; NOTE(review): the named node below also lists the odd-numbered index-list
; nodes as if they were annotations and does not list !25 -- presumably
; harmless, but confirm.
!nvvm.annotations = !{!0, !1, !2, !3, !4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24}
|
|
|
|
!0 = !{ptr @grid_const_int, !"grid_constant", !1}
|
|
!1 = !{i32 1}
|
|
|
|
!2 = !{ptr @grid_const_struct, !"grid_constant", !3}
|
|
!3 = !{i32 1}
|
|
|
|
!4 = !{ptr @grid_const_escape, !"grid_constant", !5}
|
|
!5 = !{i32 1}
|
|
|
|
; Two grid_constant parameters: %input (1) and %b (3); %a (2) is a plain i32.
!6 = !{ptr @multiple_grid_const_escape, !"grid_constant", !7}
|
|
!7 = !{i32 1, i32 3}
|
|
|
|
!8 = !{ptr @grid_const_memory_escape, !"grid_constant", !9}
|
|
!9 = !{i32 1}
|
|
|
|
!10 = !{ptr @grid_const_inlineasm_escape, !"grid_constant", !11}
|
|
!11 = !{i32 1}
|
|
|
|
!12 = !{ptr @grid_const_partial_escape, !"grid_constant", !13}
|
|
!13 = !{i32 1}
|
|
|
|
!14 = !{ptr @grid_const_partial_escapemem, !"grid_constant", !15}
|
|
!15 = !{i32 1}
|
|
|
|
!16 = !{ptr @grid_const_phi, !"grid_constant", !17}
|
|
!17 = !{i32 1}
|
|
|
|
!18 = !{ptr @grid_const_phi_ngc, !"grid_constant", !19}
|
|
!19 = !{i32 1}
|
|
|
|
!20 = !{ptr @grid_const_select, !"grid_constant", !21}
|
|
!21 = !{i32 1}
|
|
|
|
!22 = !{ptr @grid_const_ptrtoint, !"grid_constant", !23}
|
|
!23 = !{i32 1}
|
|
|
|
!24 = !{ptr @test_forward_byval_arg, !"grid_constant", !25}
|
|
!25 = !{i32 1}
|
|
|