Files
clang-p2996/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
Alex MacLean 369891b674 [NVPTX] use untyped loads and stores where ever possible (#137698)
In most cases, the type information attached to load and store
instructions is meaningless and inconsistently applied. We can usually
use ".b" loads and avoid the complexity of trying to assign the correct
type. The one expectation is sign-extending load, which will continue to
use ".s" to ensure the sign extension into a larger register is done
correctly.
2025-05-10 08:26:26 -07:00

757 lines
38 KiB
LLVM

; RUN: llc < %s -mtriple=nvptx64-unknown-unknown | FileCheck %s
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64-unknown-unknown | %ptxas-verify %}
;
; Check that parameters of a __device__ function with private or internal
; linkage called from a __global__ (kernel) function get increased alignment,
; and additional vectorization is performed on loads/stores with that
; parameters.
;
; Test IR is a minimized version of IR generated with the following command
; from the source code below:
; $ clang++ -O3 --cuda-gpu-arch=sm_35 -S -emit-llvm src.cu
;
; ----------------------------------------------------------------------------
; #include <stdint.h>
;
; struct St4x1 { uint32_t field[1]; };
; struct St4x2 { uint32_t field[2]; };
; struct St4x3 { uint32_t field[3]; };
; struct St4x4 { uint32_t field[4]; };
; struct St4x5 { uint32_t field[5]; };
; struct St4x6 { uint32_t field[6]; };
; struct St4x7 { uint32_t field[7]; };
; struct St4x8 { uint32_t field[8]; };
; struct St8x1 { uint64_t field[1]; };
; struct St8x2 { uint64_t field[2]; };
; struct St8x3 { uint64_t field[3]; };
; struct St8x4 { uint64_t field[4]; };
;
; #define DECLARE_CALLEE(StName) \
; static __device__ __attribute__((noinline)) \
; struct StName callee_##StName(struct StName in) { \
; struct StName ret; \
; const unsigned size = sizeof(ret.field) / sizeof(*ret.field); \
; for (unsigned i = 0; i != size; ++i) \
; ret.field[i] = in.field[i]; \
; return ret; \
; } \
; #define DECLARE_CALLER(StName) \
; __global__ \
; void caller_##StName(struct StName in, struct StName* ret) \
; { \
; *ret = callee_##StName(in); \
; } \
;
; #define DECLARE_CALL(StName) \
; DECLARE_CALLEE(StName) \
; DECLARE_CALLER(StName) \
;
; DECLARE_CALL(St4x1)
; DECLARE_CALL(St4x2)
; DECLARE_CALL(St4x3)
; DECLARE_CALL(St4x4)
; DECLARE_CALL(St4x5)
; DECLARE_CALL(St4x6)
; DECLARE_CALL(St4x7)
; DECLARE_CALL(St4x8)
; DECLARE_CALL(St8x1)
; DECLARE_CALL(St8x2)
; DECLARE_CALL(St8x3)
; DECLARE_CALL(St8x4)
; ----------------------------------------------------------------------------
%struct.St4x1 = type { [1 x i32] }
%struct.St4x2 = type { [2 x i32] }
%struct.St4x3 = type { [3 x i32] }
%struct.St4x4 = type { [4 x i32] }
%struct.St4x5 = type { [5 x i32] }
%struct.St4x6 = type { [6 x i32] }
%struct.St4x7 = type { [7 x i32] }
%struct.St4x8 = type { [8 x i32] }
%struct.St8x1 = type { [1 x i64] }
%struct.St8x2 = type { [2 x i64] }
%struct.St8x3 = type { [3 x i64] }
%struct.St8x4 = type { [4 x i64] }
; Section 1 - checking that:
; - function argument (including retval) vectorization is done with internal linkage;
; - caller and callee specify correct alignment for callee's params.
define dso_local void @caller_St4x1(ptr nocapture noundef readonly byval(%struct.St4x1) align 4 %in, ptr nocapture noundef writeonly %ret) {
; CHECK-LABEL: .visible .func caller_St4x1(
; CHECK: .param .align 4 .b8 caller_St4x1_param_0[4],
; CHECK: .param .b64 caller_St4x1_param_1
; CHECK: )
; CHECK: .param .b32 param0;
; CHECK: st.param.b32 [param0], {{%r[0-9]+}};
; CHECK: .param .align 16 .b8 retval0[4];
; CHECK: call.uni (retval0),
; CHECK-NEXT: callee_St4x1,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0];
%1 = load i32, ptr %in, align 4
%call = tail call fastcc [1 x i32] @callee_St4x1(i32 %1) #2
%.fca.0.extract = extractvalue [1 x i32] %call, 0
store i32 %.fca.0.extract, ptr %ret, align 4
ret void
}
define internal fastcc [1 x i32] @callee_St4x1(i32 %in.0.val) {
; CHECK: .func (.param .align 16 .b8 func_retval0[4])
; CHECK-LABEL: callee_St4x1(
; CHECK-NEXT: .param .b32 callee_St4x1_param_0
; CHECK: ld.param.b32 [[R1:%r[0-9]+]], [callee_St4x1_param_0];
; CHECK: st.param.b32 [func_retval0], [[R1]];
; CHECK-NEXT: ret;
%oldret = insertvalue [1 x i32] poison, i32 %in.0.val, 0
ret [1 x i32] %oldret
}
define dso_local void @caller_St4x2(ptr nocapture noundef readonly byval(%struct.St4x2) align 4 %in, ptr nocapture noundef writeonly %ret) {
; CHECK-LABEL: .visible .func caller_St4x2(
; CHECK: .param .align 4 .b8 caller_St4x2_param_0[8],
; CHECK: .param .b64 caller_St4x2_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[8];
; CHECK: st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[8];
; CHECK: call.uni (retval0),
; CHECK-NEXT: callee_St4x2,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
%agg.tmp = alloca %struct.St4x2, align 8
%1 = load i64, ptr %in, align 4
store i64 %1, ptr %agg.tmp, align 8
%call = tail call fastcc [2 x i32] @callee_St4x2(ptr noundef nonnull byval(%struct.St4x2) align 4 %agg.tmp) #2
%.fca.0.extract = extractvalue [2 x i32] %call, 0
%.fca.1.extract = extractvalue [2 x i32] %call, 1
store i32 %.fca.0.extract, ptr %ret, align 4
%ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 4
store i32 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 4
ret void
}
define internal fastcc [2 x i32] @callee_St4x2(ptr nocapture noundef readonly byval(%struct.St4x2) align 4 %in) {
; CHECK: .func (.param .align 16 .b8 func_retval0[8])
; CHECK-LABEL: callee_St4x2(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x2_param_0[8]
; CHECK: ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x2_param_0];
; CHECK: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]};
; CHECK-NEXT: ret;
%1 = load i32, ptr %in, align 4
%arrayidx.1 = getelementptr inbounds [2 x i32], ptr %in, i64 0, i64 1
%2 = load i32, ptr %arrayidx.1, align 4
%3 = insertvalue [2 x i32] poison, i32 %1, 0
%oldret = insertvalue [2 x i32] %3, i32 %2, 1
ret [2 x i32] %oldret
}
define dso_local void @caller_St4x3(ptr nocapture noundef readonly byval(%struct.St4x3) align 4 %in, ptr nocapture noundef writeonly %ret) {
; CHECK-LABEL: .visible .func caller_St4x3(
; CHECK: .param .align 4 .b8 caller_St4x3_param_0[12],
; CHECK: .param .b64 caller_St4x3_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[12];
; CHECK: st.param.v2.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.b32 [param0+8], {{%r[0-9]+}};
; CHECK: .param .align 16 .b8 retval0[12];
; CHECK: call.uni (retval0),
; CHECK-NEXT: callee_St4x3,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+8];
%call = tail call fastcc [3 x i32] @callee_St4x3(ptr noundef nonnull byval(%struct.St4x3) align 4 %in) #2
%.fca.0.extract = extractvalue [3 x i32] %call, 0
%.fca.1.extract = extractvalue [3 x i32] %call, 1
%.fca.2.extract = extractvalue [3 x i32] %call, 2
store i32 %.fca.0.extract, ptr %ret, align 4
%ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 4
store i32 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 4
%ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
store i32 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 4
ret void
}
define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly byval(%struct.St4x3) align 4 %in) {
; CHECK: .func (.param .align 16 .b8 func_retval0[12])
; CHECK-LABEL: callee_St4x3(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12]
; CHECK: ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0];
; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8];
; CHECK: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]};
; CHECK: st.param.b32 [func_retval0+8], [[R3]];
; CHECK-NEXT: ret;
%1 = load i32, ptr %in, align 4
%arrayidx.1 = getelementptr inbounds [3 x i32], ptr %in, i64 0, i64 1
%2 = load i32, ptr %arrayidx.1, align 4
%arrayidx.2 = getelementptr inbounds [3 x i32], ptr %in, i64 0, i64 2
%3 = load i32, ptr %arrayidx.2, align 4
%4 = insertvalue [3 x i32] poison, i32 %1, 0
%5 = insertvalue [3 x i32] %4, i32 %2, 1
%oldret = insertvalue [3 x i32] %5, i32 %3, 2
ret [3 x i32] %oldret
}
define dso_local void @caller_St4x4(ptr nocapture noundef readonly byval(%struct.St4x4) align 4 %in, ptr nocapture noundef writeonly %ret) {
; CHECK-LABEL: .visible .func caller_St4x4(
; CHECK: .param .align 4 .b8 caller_St4x4_param_0[16],
; CHECK: .param .b64 caller_St4x4_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[16];
; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[16];
; CHECK: call.uni (retval0),
; CHECK-NEXT: callee_St4x4,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
%call = tail call fastcc [4 x i32] @callee_St4x4(ptr noundef nonnull byval(%struct.St4x4) align 4 %in) #2
%.fca.0.extract = extractvalue [4 x i32] %call, 0
%.fca.1.extract = extractvalue [4 x i32] %call, 1
%.fca.2.extract = extractvalue [4 x i32] %call, 2
%.fca.3.extract = extractvalue [4 x i32] %call, 3
store i32 %.fca.0.extract, ptr %ret, align 4
%ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 4
store i32 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 4
%ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
store i32 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 4
%ref.tmp.sroa.6.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 12
store i32 %.fca.3.extract, ptr %ref.tmp.sroa.6.0..sroa_idx, align 4
ret void
}
define internal fastcc [4 x i32] @callee_St4x4(ptr nocapture noundef readonly byval(%struct.St4x4) align 4 %in) {
; CHECK: .func (.param .align 16 .b8 func_retval0[16])
; CHECK-LABEL: callee_St4x4(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x4_param_0[16]
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_param_0];
; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
; CHECK-NEXT: ret;
%1 = load i32, ptr %in, align 4
%arrayidx.1 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 1
%2 = load i32, ptr %arrayidx.1, align 4
%arrayidx.2 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 2
%3 = load i32, ptr %arrayidx.2, align 4
%arrayidx.3 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 3
%4 = load i32, ptr %arrayidx.3, align 4
%5 = insertvalue [4 x i32] poison, i32 %1, 0
%6 = insertvalue [4 x i32] %5, i32 %2, 1
%7 = insertvalue [4 x i32] %6, i32 %3, 2
%oldret = insertvalue [4 x i32] %7, i32 %4, 3
ret [4 x i32] %oldret
}
define dso_local void @caller_St4x5(ptr nocapture noundef readonly byval(%struct.St4x5) align 4 %in, ptr nocapture noundef writeonly %ret) {
; CHECK-LABEL: .visible .func caller_St4x5(
; CHECK: .param .align 4 .b8 caller_St4x5_param_0[20],
; CHECK: .param .b64 caller_St4x5_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[20];
; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.b32 [param0+16], {{%r[0-9]+}};
; CHECK: .param .align 16 .b8 retval0[20];
; CHECK: call.uni (retval0),
; CHECK-NEXT: callee_St4x5,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+16];
%call = tail call fastcc [5 x i32] @callee_St4x5(ptr noundef nonnull byval(%struct.St4x5) align 4 %in) #2
%.fca.0.extract = extractvalue [5 x i32] %call, 0
%.fca.1.extract = extractvalue [5 x i32] %call, 1
%.fca.2.extract = extractvalue [5 x i32] %call, 2
%.fca.3.extract = extractvalue [5 x i32] %call, 3
%.fca.4.extract = extractvalue [5 x i32] %call, 4
store i32 %.fca.0.extract, ptr %ret, align 4
%ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 4
store i32 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 4
%ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
store i32 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 4
%ref.tmp.sroa.6.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 12
store i32 %.fca.3.extract, ptr %ref.tmp.sroa.6.0..sroa_idx, align 4
%ref.tmp.sroa.7.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 16
store i32 %.fca.4.extract, ptr %ref.tmp.sroa.7.0..sroa_idx, align 4
ret void
}
define internal fastcc [5 x i32] @callee_St4x5(ptr nocapture noundef readonly byval(%struct.St4x5) align 4 %in) {
; CHECK: .func (.param .align 16 .b8 func_retval0[20])
; CHECK-LABEL: callee_St4x5(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x5_param_0[20]
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x5_param_0];
; CHECK: ld.param.b32 [[R5:%r[0-9]+]], [callee_St4x5_param_0+16];
; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
; CHECK: st.param.b32 [func_retval0+16], [[R5]];
; CHECK-NEXT: ret;
%1 = load i32, ptr %in, align 4
%arrayidx.1 = getelementptr inbounds [5 x i32], ptr %in, i64 0, i64 1
%2 = load i32, ptr %arrayidx.1, align 4
%arrayidx.2 = getelementptr inbounds [5 x i32], ptr %in, i64 0, i64 2
%3 = load i32, ptr %arrayidx.2, align 4
%arrayidx.3 = getelementptr inbounds [5 x i32], ptr %in, i64 0, i64 3
%4 = load i32, ptr %arrayidx.3, align 4
%arrayidx.4 = getelementptr inbounds [5 x i32], ptr %in, i64 0, i64 4
%5 = load i32, ptr %arrayidx.4, align 4
%6 = insertvalue [5 x i32] poison, i32 %1, 0
%7 = insertvalue [5 x i32] %6, i32 %2, 1
%8 = insertvalue [5 x i32] %7, i32 %3, 2
%9 = insertvalue [5 x i32] %8, i32 %4, 3
%oldret = insertvalue [5 x i32] %9, i32 %5, 4
ret [5 x i32] %oldret
}
define dso_local void @caller_St4x6(ptr nocapture noundef readonly byval(%struct.St4x6) align 4 %in, ptr nocapture noundef writeonly %ret) {
; CHECK-LABEL: .visible .func caller_St4x6(
; CHECK: .param .align 4 .b8 caller_St4x6_param_0[24],
; CHECK: .param .b64 caller_St4x6_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[24];
; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[24];
; CHECK: call.uni (retval0),
; CHECK-NEXT: callee_St4x6,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
%call = tail call fastcc [6 x i32] @callee_St4x6(ptr noundef nonnull byval(%struct.St4x6) align 4 %in) #2
%.fca.0.extract = extractvalue [6 x i32] %call, 0
%.fca.1.extract = extractvalue [6 x i32] %call, 1
%.fca.2.extract = extractvalue [6 x i32] %call, 2
%.fca.3.extract = extractvalue [6 x i32] %call, 3
%.fca.4.extract = extractvalue [6 x i32] %call, 4
%.fca.5.extract = extractvalue [6 x i32] %call, 5
store i32 %.fca.0.extract, ptr %ret, align 4
%ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 4
store i32 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 4
%ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
store i32 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 4
%ref.tmp.sroa.6.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 12
store i32 %.fca.3.extract, ptr %ref.tmp.sroa.6.0..sroa_idx, align 4
%ref.tmp.sroa.7.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 16
store i32 %.fca.4.extract, ptr %ref.tmp.sroa.7.0..sroa_idx, align 4
%ref.tmp.sroa.8.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 20
store i32 %.fca.5.extract, ptr %ref.tmp.sroa.8.0..sroa_idx, align 4
ret void
}
define internal fastcc [6 x i32] @callee_St4x6(ptr nocapture noundef readonly byval(%struct.St4x6) align 4 %in) {
; CHECK: .func (.param .align 16 .b8 func_retval0[24])
; CHECK-LABEL: callee_St4x6(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x6_param_0[24]
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x6_param_0];
; CHECK: ld.param.v2.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x6_param_0+16];
; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
; CHECK: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
; CHECK-NEXT: ret;
%1 = load i32, ptr %in, align 4
%arrayidx.1 = getelementptr inbounds [6 x i32], ptr %in, i64 0, i64 1
%2 = load i32, ptr %arrayidx.1, align 4
%arrayidx.2 = getelementptr inbounds [6 x i32], ptr %in, i64 0, i64 2
%3 = load i32, ptr %arrayidx.2, align 4
%arrayidx.3 = getelementptr inbounds [6 x i32], ptr %in, i64 0, i64 3
%4 = load i32, ptr %arrayidx.3, align 4
%arrayidx.4 = getelementptr inbounds [6 x i32], ptr %in, i64 0, i64 4
%5 = load i32, ptr %arrayidx.4, align 4
%arrayidx.5 = getelementptr inbounds [6 x i32], ptr %in, i64 0, i64 5
%6 = load i32, ptr %arrayidx.5, align 4
%7 = insertvalue [6 x i32] poison, i32 %1, 0
%8 = insertvalue [6 x i32] %7, i32 %2, 1
%9 = insertvalue [6 x i32] %8, i32 %3, 2
%10 = insertvalue [6 x i32] %9, i32 %4, 3
%11 = insertvalue [6 x i32] %10, i32 %5, 4
%oldret = insertvalue [6 x i32] %11, i32 %6, 5
ret [6 x i32] %oldret
}
define dso_local void @caller_St4x7(ptr nocapture noundef readonly byval(%struct.St4x7) align 4 %in, ptr nocapture noundef writeonly %ret) {
; CHECK-LABEL: .visible .func caller_St4x7(
; CHECK: .param .align 4 .b8 caller_St4x7_param_0[28],
; CHECK: .param .b64 caller_St4x7_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[28];
; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.v2.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.b32 [param0+24], {{%r[0-9]+}};
; CHECK: .param .align 16 .b8 retval0[28];
; CHECK: call.uni (retval0),
; CHECK-NEXT: callee_St4x7,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
; CHECK: ld.param.v2.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
; CHECK: ld.param.b32 {{%r[0-9]+}}, [retval0+24];
%call = tail call fastcc [7 x i32] @callee_St4x7(ptr noundef nonnull byval(%struct.St4x7) align 4 %in) #2
%.fca.0.extract = extractvalue [7 x i32] %call, 0
%.fca.1.extract = extractvalue [7 x i32] %call, 1
%.fca.2.extract = extractvalue [7 x i32] %call, 2
%.fca.3.extract = extractvalue [7 x i32] %call, 3
%.fca.4.extract = extractvalue [7 x i32] %call, 4
%.fca.5.extract = extractvalue [7 x i32] %call, 5
%.fca.6.extract = extractvalue [7 x i32] %call, 6
store i32 %.fca.0.extract, ptr %ret, align 4
%ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 4
store i32 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 4
%ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
store i32 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 4
%ref.tmp.sroa.6.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 12
store i32 %.fca.3.extract, ptr %ref.tmp.sroa.6.0..sroa_idx, align 4
%ref.tmp.sroa.7.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 16
store i32 %.fca.4.extract, ptr %ref.tmp.sroa.7.0..sroa_idx, align 4
%ref.tmp.sroa.8.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 20
store i32 %.fca.5.extract, ptr %ref.tmp.sroa.8.0..sroa_idx, align 4
%ref.tmp.sroa.9.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 24
store i32 %.fca.6.extract, ptr %ref.tmp.sroa.9.0..sroa_idx, align 4
ret void
}
define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly byval(%struct.St4x7) align 4 %in) {
; CHECK: .func (.param .align 16 .b8 func_retval0[28])
; CHECK-LABEL: callee_St4x7(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28]
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0];
; CHECK: ld.param.v2.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16];
; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24];
; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
; CHECK: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
; CHECK: st.param.b32 [func_retval0+24], [[R7]];
; CHECK-NEXT: ret;
%1 = load i32, ptr %in, align 4
%arrayidx.1 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 1
%2 = load i32, ptr %arrayidx.1, align 4
%arrayidx.2 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 2
%3 = load i32, ptr %arrayidx.2, align 4
%arrayidx.3 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 3
%4 = load i32, ptr %arrayidx.3, align 4
%arrayidx.4 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 4
%5 = load i32, ptr %arrayidx.4, align 4
%arrayidx.5 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 5
%6 = load i32, ptr %arrayidx.5, align 4
%arrayidx.6 = getelementptr inbounds [7 x i32], ptr %in, i64 0, i64 6
%7 = load i32, ptr %arrayidx.6, align 4
%8 = insertvalue [7 x i32] poison, i32 %1, 0
%9 = insertvalue [7 x i32] %8, i32 %2, 1
%10 = insertvalue [7 x i32] %9, i32 %3, 2
%11 = insertvalue [7 x i32] %10, i32 %4, 3
%12 = insertvalue [7 x i32] %11, i32 %5, 4
%13 = insertvalue [7 x i32] %12, i32 %6, 5
%oldret = insertvalue [7 x i32] %13, i32 %7, 6
ret [7 x i32] %oldret
}
define dso_local void @caller_St4x8(ptr nocapture noundef readonly byval(%struct.St4x8) align 4 %in, ptr nocapture noundef writeonly %ret) {
; CHECK-LABEL: .visible .func caller_St4x8(
; CHECK: .param .align 4 .b8 caller_St4x8_param_0[32],
; CHECK: .param .b64 caller_St4x8_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[32];
; CHECK: st.param.v4.b32 [param0], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: st.param.v4.b32 [param0+16], {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[32];
; CHECK: call.uni (retval0),
; CHECK-NEXT: callee_St4x8,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0];
; CHECK: ld.param.v4.b32 {{{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}, {{%r[0-9]+}}}, [retval0+16];
%call = tail call fastcc [8 x i32] @callee_St4x8(ptr noundef nonnull byval(%struct.St4x8) align 4 %in) #2
%.fca.0.extract = extractvalue [8 x i32] %call, 0
%.fca.1.extract = extractvalue [8 x i32] %call, 1
%.fca.2.extract = extractvalue [8 x i32] %call, 2
%.fca.3.extract = extractvalue [8 x i32] %call, 3
%.fca.4.extract = extractvalue [8 x i32] %call, 4
%.fca.5.extract = extractvalue [8 x i32] %call, 5
%.fca.6.extract = extractvalue [8 x i32] %call, 6
%.fca.7.extract = extractvalue [8 x i32] %call, 7
store i32 %.fca.0.extract, ptr %ret, align 4
%ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 4
store i32 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 4
%ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
store i32 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 4
%ref.tmp.sroa.6.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 12
store i32 %.fca.3.extract, ptr %ref.tmp.sroa.6.0..sroa_idx, align 4
%ref.tmp.sroa.7.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 16
store i32 %.fca.4.extract, ptr %ref.tmp.sroa.7.0..sroa_idx, align 4
%ref.tmp.sroa.8.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 20
store i32 %.fca.5.extract, ptr %ref.tmp.sroa.8.0..sroa_idx, align 4
%ref.tmp.sroa.9.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 24
store i32 %.fca.6.extract, ptr %ref.tmp.sroa.9.0..sroa_idx, align 4
%ref.tmp.sroa.10.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 28
store i32 %.fca.7.extract, ptr %ref.tmp.sroa.10.0..sroa_idx, align 4
ret void
}
define internal fastcc [8 x i32] @callee_St4x8(ptr nocapture noundef readonly byval(%struct.St4x8) align 4 %in) {
; CHECK: .func (.param .align 16 .b8 func_retval0[32])
; CHECK-LABEL: callee_St4x8(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x8_param_0[32]
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x8_param_0];
; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], [[R8:%r[0-9]+]]}, [callee_St4x8_param_0+16];
; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
; CHECK: st.param.v4.b32 [func_retval0+16], {[[R5]], [[R6]], [[R7]], [[R8]]};
; CHECK-NEXT: ret;
%1 = load i32, ptr %in, align 4
%arrayidx.1 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 1
%2 = load i32, ptr %arrayidx.1, align 4
%arrayidx.2 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 2
%3 = load i32, ptr %arrayidx.2, align 4
%arrayidx.3 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 3
%4 = load i32, ptr %arrayidx.3, align 4
%arrayidx.4 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 4
%5 = load i32, ptr %arrayidx.4, align 4
%arrayidx.5 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 5
%6 = load i32, ptr %arrayidx.5, align 4
%arrayidx.6 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 6
%7 = load i32, ptr %arrayidx.6, align 4
%arrayidx.7 = getelementptr inbounds [8 x i32], ptr %in, i64 0, i64 7
%8 = load i32, ptr %arrayidx.7, align 4
%9 = insertvalue [8 x i32] poison, i32 %1, 0
%10 = insertvalue [8 x i32] %9, i32 %2, 1
%11 = insertvalue [8 x i32] %10, i32 %3, 2
%12 = insertvalue [8 x i32] %11, i32 %4, 3
%13 = insertvalue [8 x i32] %12, i32 %5, 4
%14 = insertvalue [8 x i32] %13, i32 %6, 5
%15 = insertvalue [8 x i32] %14, i32 %7, 6
%oldret = insertvalue [8 x i32] %15, i32 %8, 7
ret [8 x i32] %oldret
}
define dso_local void @caller_St8x1(ptr nocapture noundef readonly byval(%struct.St8x1) align 8 %in, ptr nocapture noundef writeonly %ret) {
; CHECK-LABEL: .visible .func caller_St8x1(
; CHECK: .param .align 8 .b8 caller_St8x1_param_0[8],
; CHECK: .param .b64 caller_St8x1_param_1
; CHECK: )
; CHECK: .param .b64 param0;
; CHECK: st.param.b64 [param0], {{%rd[0-9]+}};
; CHECK: .param .align 16 .b8 retval0[8];
; CHECK: call.uni (retval0),
; CHECK-NEXT: callee_St8x1,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0];
%1 = load i64, ptr %in, align 8
%call = tail call fastcc [1 x i64] @callee_St8x1(i64 %1) #2
%.fca.0.extract = extractvalue [1 x i64] %call, 0
store i64 %.fca.0.extract, ptr %ret, align 8
ret void
}
define internal fastcc [1 x i64] @callee_St8x1(i64 %in.0.val) {
; CHECK: .func (.param .align 16 .b8 func_retval0[8])
; CHECK-LABEL: callee_St8x1(
; CHECK-NEXT: .param .b64 callee_St8x1_param_0
; CHECK: ld.param.b64 [[RD1:%rd[0-9]+]], [callee_St8x1_param_0];
; CHECK: st.param.b64 [func_retval0], [[RD1]];
; CHECK-NEXT: ret;
%oldret = insertvalue [1 x i64] poison, i64 %in.0.val, 0
ret [1 x i64] %oldret
}
define dso_local void @caller_St8x2(ptr nocapture noundef readonly byval(%struct.St8x2) align 8 %in, ptr nocapture noundef writeonly %ret) {
; CHECK-LABEL: .visible .func caller_St8x2(
; CHECK: .param .align 8 .b8 caller_St8x2_param_0[16],
; CHECK: .param .b64 caller_St8x2_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[16];
; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[16];
; CHECK: call.uni (retval0),
; CHECK-NEXT: callee_St8x2,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0];
%call = tail call fastcc [2 x i64] @callee_St8x2(ptr noundef nonnull byval(%struct.St8x2) align 8 %in) #2
%.fca.0.extract = extractvalue [2 x i64] %call, 0
%.fca.1.extract = extractvalue [2 x i64] %call, 1
store i64 %.fca.0.extract, ptr %ret, align 8
%ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
store i64 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 8
ret void
}
define internal fastcc [2 x i64] @callee_St8x2(ptr nocapture noundef readonly byval(%struct.St8x2) align 8 %in) {
; CHECK: .func (.param .align 16 .b8 func_retval0[16])
; CHECK-LABEL: callee_St8x2(
; CHECK-NEXT: .param .align 16 .b8 callee_St8x2_param_0[16]
; CHECK: ld.param.v2.b64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x2_param_0];
; CHECK: st.param.v2.b64 [func_retval0], {[[RD1]], [[RD2]]};
; CHECK-NEXT: ret;
%1 = load i64, ptr %in, align 8
%arrayidx.1 = getelementptr inbounds [2 x i64], ptr %in, i64 0, i64 1
%2 = load i64, ptr %arrayidx.1, align 8
%3 = insertvalue [2 x i64] poison, i64 %1, 0
%oldret = insertvalue [2 x i64] %3, i64 %2, 1
ret [2 x i64] %oldret
}
define dso_local void @caller_St8x3(ptr nocapture noundef readonly byval(%struct.St8x3) align 8 %in, ptr nocapture noundef writeonly %ret) {
; CHECK-LABEL: .visible .func caller_St8x3(
; CHECK: .param .align 8 .b8 caller_St8x3_param_0[24],
; CHECK: .param .b64 caller_St8x3_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[24];
; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
; CHECK: st.param.b64 [param0+16], {{%rd[0-9]+}};
; CHECK: .param .align 16 .b8 retval0[24];
; CHECK: call.uni (retval0),
; CHECK-NEXT: callee_St8x3,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0];
; CHECK: ld.param.b64 {{%rd[0-9]+}}, [retval0+16];
%call = tail call fastcc [3 x i64] @callee_St8x3(ptr noundef nonnull byval(%struct.St8x3) align 8 %in) #2
%.fca.0.extract = extractvalue [3 x i64] %call, 0
%.fca.1.extract = extractvalue [3 x i64] %call, 1
%.fca.2.extract = extractvalue [3 x i64] %call, 2
store i64 %.fca.0.extract, ptr %ret, align 8
%ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
store i64 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 8
%ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 16
store i64 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 8
ret void
}
define internal fastcc [3 x i64] @callee_St8x3(ptr nocapture noundef readonly byval(%struct.St8x3) align 8 %in) {
; CHECK: .func (.param .align 16 .b8 func_retval0[24])
; CHECK-LABEL: callee_St8x3(
; CHECK-NEXT: .param .align 16 .b8 callee_St8x3_param_0[24]
; CHECK: ld.param.v2.b64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x3_param_0];
; CHECK: ld.param.b64 [[RD3:%rd[0-9]+]], [callee_St8x3_param_0+16];
; CHECK: st.param.v2.b64 [func_retval0], {[[RD1]], [[RD2]]};
; CHECK: st.param.b64 [func_retval0+16], [[RD3]];
; CHECK-NEXT: ret;
%1 = load i64, ptr %in, align 8
%arrayidx.1 = getelementptr inbounds [3 x i64], ptr %in, i64 0, i64 1
%2 = load i64, ptr %arrayidx.1, align 8
%arrayidx.2 = getelementptr inbounds [3 x i64], ptr %in, i64 0, i64 2
%3 = load i64, ptr %arrayidx.2, align 8
%4 = insertvalue [3 x i64] poison, i64 %1, 0
%5 = insertvalue [3 x i64] %4, i64 %2, 1
%oldret = insertvalue [3 x i64] %5, i64 %3, 2
ret [3 x i64] %oldret
}
define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct.St8x4) align 8 %in, ptr nocapture noundef writeonly %ret) {
; CHECK-LABEL: .visible .func caller_St8x4(
; CHECK: .param .align 8 .b8 caller_St8x4_param_0[32],
; CHECK: .param .b64 caller_St8x4_param_1
; CHECK: )
; CHECK: .param .align 16 .b8 param0[32];
; CHECK: st.param.v2.b64 [param0], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
; CHECK: st.param.v2.b64 [param0+16], {{{%rd[0-9]+}}, {{%rd[0-9]+}}};
; CHECK: .param .align 16 .b8 retval0[32];
; CHECK: call.uni (retval0),
; CHECK-NEXT: callee_St8x4,
; CHECK-NEXT: (
; CHECK-NEXT: param0
; CHECK-NEXT: );
; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0];
; CHECK: ld.param.v2.b64 {{{%rd[0-9]+}}, {{%rd[0-9]+}}}, [retval0+16];
%call = tail call fastcc [4 x i64] @callee_St8x4(ptr noundef nonnull byval(%struct.St8x4) align 8 %in) #2
%.fca.0.extract = extractvalue [4 x i64] %call, 0
%.fca.1.extract = extractvalue [4 x i64] %call, 1
%.fca.2.extract = extractvalue [4 x i64] %call, 2
%.fca.3.extract = extractvalue [4 x i64] %call, 3
store i64 %.fca.0.extract, ptr %ret, align 8
%ref.tmp.sroa.4.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 8
store i64 %.fca.1.extract, ptr %ref.tmp.sroa.4.0..sroa_idx, align 8
%ref.tmp.sroa.5.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 16
store i64 %.fca.2.extract, ptr %ref.tmp.sroa.5.0..sroa_idx, align 8
%ref.tmp.sroa.6.0..sroa_idx = getelementptr inbounds i8, ptr %ret, i64 24
store i64 %.fca.3.extract, ptr %ref.tmp.sroa.6.0..sroa_idx, align 8
ret void
}
define internal fastcc [4 x i64] @callee_St8x4(ptr nocapture noundef readonly byval(%struct.St8x4) align 8 %in) {
; CHECK: .func (.param .align 16 .b8 func_retval0[32])
; CHECK-LABEL: callee_St8x4(
; CHECK-NEXT: .param .align 16 .b8 callee_St8x4_param_0[32]
; CHECK: ld.param.v2.b64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x4_param_0];
; CHECK: ld.param.v2.b64 {[[RD3:%rd[0-9]+]], [[RD4:%rd[0-9]+]]}, [callee_St8x4_param_0+16];
; CHECK: st.param.v2.b64 [func_retval0], {[[RD1]], [[RD2]]};
; CHECK: st.param.v2.b64 [func_retval0+16], {[[RD3]], [[RD4]]};
; CHECK-NEXT: ret;
%1 = load i64, ptr %in, align 8
%arrayidx.1 = getelementptr inbounds [4 x i64], ptr %in, i64 0, i64 1
%2 = load i64, ptr %arrayidx.1, align 8
%arrayidx.2 = getelementptr inbounds [4 x i64], ptr %in, i64 0, i64 2
%3 = load i64, ptr %arrayidx.2, align 8
%arrayidx.3 = getelementptr inbounds [4 x i64], ptr %in, i64 0, i64 3
%4 = load i64, ptr %arrayidx.3, align 8
%5 = insertvalue [4 x i64] poison, i64 %1, 0
%6 = insertvalue [4 x i64] %5, i64 %2, 1
%7 = insertvalue [4 x i64] %6, i64 %3, 2
%oldret = insertvalue [4 x i64] %7, i64 %4, 3
ret [4 x i64] %oldret
}
; Section 2 - checking that function argument (including retval) vectorization is done with private linkage.
define private fastcc [4 x i32] @callee_St4x4_private(ptr nocapture noundef readonly byval(%struct.St4x4) align 4 %in) {
; CHECK: .func (.param .align 16 .b8 func_retval0[16])
; CHECK-LABEL: callee_St4x4_private(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x4_private_param_0[16]
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_private_param_0];
; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
; CHECK-NEXT: ret;
%1 = load i32, ptr %in, align 4
%arrayidx.1 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 1
%2 = load i32, ptr %arrayidx.1, align 4
%arrayidx.2 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 2
%3 = load i32, ptr %arrayidx.2, align 4
%arrayidx.3 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 3
%4 = load i32, ptr %arrayidx.3, align 4
%5 = insertvalue [4 x i32] poison, i32 %1, 0
%6 = insertvalue [4 x i32] %5, i32 %2, 1
%7 = insertvalue [4 x i32] %6, i32 %3, 2
%oldret = insertvalue [4 x i32] %7, i32 %4, 3
ret [4 x i32] %oldret
}
; Section 3 - checking that function argument (including retval) vectorization
; is NOT done with linkage types other than internal and private.
define external fastcc [4 x i32] @callee_St4x4_external(ptr nocapture noundef readonly byval(%struct.St4x4) align 4 %in) {
; CHECK: .func (.param .align 4 .b8 func_retval0[16])
; CHECK-LABEL: callee_St4x4_external(
; CHECK-NEXT: .param .align 4 .b8 callee_St4x4_external_param_0[16]
; CHECK: ld.param.b32 [[R1:%r[0-9]+]], [callee_St4x4_external_param_0];
; CHECK: ld.param.b32 [[R2:%r[0-9]+]], [callee_St4x4_external_param_0+4];
; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [callee_St4x4_external_param_0+8];
; CHECK: ld.param.b32 [[R4:%r[0-9]+]], [callee_St4x4_external_param_0+12];
; CHECK: st.param.b32 [func_retval0], [[R1]];
; CHECK: st.param.b32 [func_retval0+4], [[R2]];
; CHECK: st.param.b32 [func_retval0+8], [[R3]];
; CHECK: st.param.b32 [func_retval0+12], [[R4]];
; CHECK-NEXT: ret;
%1 = load i32, ptr %in, align 4
%arrayidx.1 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 1
%2 = load i32, ptr %arrayidx.1, align 4
%arrayidx.2 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 2
%3 = load i32, ptr %arrayidx.2, align 4
%arrayidx.3 = getelementptr inbounds [4 x i32], ptr %in, i64 0, i64 3
%4 = load i32, ptr %arrayidx.3, align 4
%5 = insertvalue [4 x i32] poison, i32 %1, 0
%6 = insertvalue [4 x i32] %5, i32 %2, 1
%7 = insertvalue [4 x i32] %6, i32 %3, 2
%oldret = insertvalue [4 x i32] %7, i32 %4, 3
ret [4 x i32] %oldret
}