; Files
; clang-p2996/llvm/test/Transforms/SLPVectorizer/AMDGPU/vectorize-i8.ll
; Gheorghe-Teodor Bercea 3df36a2b18 [AMDGPU] Enable vectorization of i8 values. (#134934)
; This patch adjusts the cost model to account for the ability of the
; AMDGPU optimizer to group together i8 values into i32 values.

; Co-authored-by: Erich Keane <ekeane@nvidia.com>
; 2025-06-26 19:15:31 -04:00

; 872 lines
; 55 KiB
; LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -passes=slp-vectorizer %s | FileCheck -check-prefix=GFX7 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -passes=slp-vectorizer %s | FileCheck -check-prefix=GFX8 %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=slp-vectorizer %s | FileCheck -check-prefix=GFX9 %s
; arith_2: two-lane i8 arithmetic chain.
; The input IR extracts lanes 0 and 1 of %invec, computes (x * 1) + 1 on each
; as scalar i8, inserts the results into lanes 0 and 1 of a poison <16 x i8>
; and stores that vector to %out. %flag is not used by the body.
; Expected output per the assertions below:
;   - hawaii run: the scalar extract/mul/add/insert sequence is kept as is.
;   - fiji and gfx900 runs: the two lanes are gathered with a shufflevector,
;     the mul and add are performed on <2 x i8>, and the result is widened
;     back to <16 x i8> with a second shufflevector.
define protected amdgpu_kernel void @arith_2(<16 x i8> %invec, ptr %out, i32 %flag) {
; GFX7-LABEL: define protected amdgpu_kernel void @arith_2(
; GFX7-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0:[0-9]+]] {
; GFX7-NEXT: [[ENTRY:.*:]]
; GFX7-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0
; GFX7-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1
; GFX7-NEXT: [[MUL0:%.*]] = mul i8 [[EL0]], 1
; GFX7-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 1
; GFX7-NEXT: [[ADD0:%.*]] = add i8 [[MUL0]], 1
; GFX7-NEXT: [[ADD1:%.*]] = add i8 [[MUL1]], 1
; GFX7-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0
; GFX7-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1
; GFX7-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT]], align 16
; GFX7-NEXT: ret void
;
; GFX8-LABEL: define protected amdgpu_kernel void @arith_2(
; GFX8-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0:[0-9]+]] {
; GFX8-NEXT: [[ENTRY:.*:]]
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP1:%.*]] = mul <2 x i8> [[TMP0]], splat (i8 1)
; GFX8-NEXT: [[TMP2:%.*]] = add <2 x i8> [[TMP1]], splat (i8 1)
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT]], align 16
; GFX8-NEXT: ret void
;
; GFX9-LABEL: define protected amdgpu_kernel void @arith_2(
; GFX9-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0:[0-9]+]] {
; GFX9-NEXT: [[ENTRY:.*:]]
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP1:%.*]] = mul <2 x i8> [[TMP0]], splat (i8 1)
; GFX9-NEXT: [[TMP2:%.*]] = add <2 x i8> [[TMP1]], splat (i8 1)
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT]], align 16
; GFX9-NEXT: ret void
;
entry:
%el0 = extractelement <16 x i8> %invec, i64 0
%el1 = extractelement <16 x i8> %invec, i64 1
%mul0 = mul i8 %el0, 1
%mul1 = mul i8 %el1, 1
%add0 = add i8 %mul0, 1
%add1 = add i8 %mul1, 1
%vecins0 = insertelement <16 x i8> poison, i8 %add0, i64 0
%vecins1 = insertelement <16 x i8> %vecins0, i8 %add1, i64 1
store <16 x i8> %vecins1, ptr %out
ret void
}
; arith_3: three-lane i8 arithmetic chain (a lane count that is not a power of two).
; The input IR extracts lanes 0, 1 and 2 of %invec, computes (x * 1) + 1 on
; each as scalar i8, inserts the results into lanes 0..2 of a poison
; <16 x i8> and stores that vector to %out. %flag is not used by the body.
; Expected output per the assertions below:
;   - hawaii run: everything stays scalar.
;   - fiji and gfx900 runs: lane 0 stays scalar, while lanes 1 and 2 are
;     processed together as <2 x i8>; the scalar lane and the vector pair are
;     then merged with a two-operand shufflevector (mask 0, 16, 17).
define protected amdgpu_kernel void @arith_3(<16 x i8> %invec, ptr %out, i32 %flag) {
; GFX7-LABEL: define protected amdgpu_kernel void @arith_3(
; GFX7-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX7-NEXT: [[ENTRY:.*:]]
; GFX7-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0
; GFX7-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1
; GFX7-NEXT: [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2
; GFX7-NEXT: [[MUL2:%.*]] = mul i8 [[EL0]], 1
; GFX7-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 1
; GFX7-NEXT: [[MUL3:%.*]] = mul i8 [[EL2]], 1
; GFX7-NEXT: [[ADD2:%.*]] = add i8 [[MUL2]], 1
; GFX7-NEXT: [[ADD1:%.*]] = add i8 [[MUL1]], 1
; GFX7-NEXT: [[ADD3:%.*]] = add i8 [[MUL3]], 1
; GFX7-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD2]], i64 0
; GFX7-NEXT: [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1
; GFX7-NEXT: [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD3]], i64 2
; GFX7-NEXT: store <16 x i8> [[VECINS2]], ptr [[OUT]], align 16
; GFX7-NEXT: ret void
;
; GFX8-LABEL: define protected amdgpu_kernel void @arith_3(
; GFX8-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX8-NEXT: [[ENTRY:.*:]]
; GFX8-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0
; GFX8-NEXT: [[MUL3:%.*]] = mul i8 [[EL0]], 1
; GFX8-NEXT: [[ADD3:%.*]] = add i8 [[MUL3]], 1
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <2 x i32> <i32 1, i32 2>
; GFX8-NEXT: [[TMP1:%.*]] = mul <2 x i8> [[TMP0]], splat (i8 1)
; GFX8-NEXT: [[TMP2:%.*]] = add <2 x i8> [[TMP1]], splat (i8 1)
; GFX8-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD3]], i64 0
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: [[VECINS2:%.*]] = shufflevector <16 x i8> [[VECINS0]], <16 x i8> [[TMP3]], <16 x i32> <i32 0, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: store <16 x i8> [[VECINS2]], ptr [[OUT]], align 16
; GFX8-NEXT: ret void
;
; GFX9-LABEL: define protected amdgpu_kernel void @arith_3(
; GFX9-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[ENTRY:.*:]]
; GFX9-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0
; GFX9-NEXT: [[MUL3:%.*]] = mul i8 [[EL0]], 1
; GFX9-NEXT: [[ADD3:%.*]] = add i8 [[MUL3]], 1
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <2 x i32> <i32 1, i32 2>
; GFX9-NEXT: [[TMP1:%.*]] = mul <2 x i8> [[TMP0]], splat (i8 1)
; GFX9-NEXT: [[TMP2:%.*]] = add <2 x i8> [[TMP1]], splat (i8 1)
; GFX9-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD3]], i64 0
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: [[VECINS2:%.*]] = shufflevector <16 x i8> [[VECINS0]], <16 x i8> [[TMP3]], <16 x i32> <i32 0, i32 16, i32 17, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: store <16 x i8> [[VECINS2]], ptr [[OUT]], align 16
; GFX9-NEXT: ret void
;
entry:
%el0 = extractelement <16 x i8> %invec, i64 0
%el1 = extractelement <16 x i8> %invec, i64 1
%el2 = extractelement <16 x i8> %invec, i64 2
%mul0 = mul i8 %el0, 1
%mul1 = mul i8 %el1, 1
%mul2 = mul i8 %el2, 1
%add0 = add i8 %mul0, 1
%add1 = add i8 %mul1, 1
%add2 = add i8 %mul2, 1
%vecins0 = insertelement <16 x i8> poison, i8 %add0, i64 0
%vecins1 = insertelement <16 x i8> %vecins0, i8 %add1, i64 1
%vecins2 = insertelement <16 x i8> %vecins1, i8 %add2, i64 2
store <16 x i8> %vecins2, ptr %out
ret void
}
; arith_4: four-lane i8 arithmetic chain.
; The input IR extracts lanes 0..3 of %invec, computes (x * 1) + 1 on each as
; scalar i8, inserts the results into lanes 0..3 of a poison <16 x i8> and
; stores that vector to %out. %flag is not used by the body.
; Expected output per the assertions below:
;   - hawaii run: everything stays scalar.
;   - fiji and gfx900 runs: the four lanes are gathered with one
;     shufflevector, the mul and add are performed on <4 x i8>, and the
;     result is widened back to <16 x i8> with a second shufflevector.
define protected amdgpu_kernel void @arith_4(<16 x i8> %invec, ptr %out, i32 %flag) {
; GFX7-LABEL: define protected amdgpu_kernel void @arith_4(
; GFX7-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX7-NEXT: [[ENTRY:.*:]]
; GFX7-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0
; GFX7-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1
; GFX7-NEXT: [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2
; GFX7-NEXT: [[EL3:%.*]] = extractelement <16 x i8> [[INVEC]], i64 3
; GFX7-NEXT: [[MUL0:%.*]] = mul i8 [[EL0]], 1
; GFX7-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 1
; GFX7-NEXT: [[MUL2:%.*]] = mul i8 [[EL2]], 1
; GFX7-NEXT: [[MUL3:%.*]] = mul i8 [[EL3]], 1
; GFX7-NEXT: [[ADD0:%.*]] = add i8 [[MUL0]], 1
; GFX7-NEXT: [[ADD1:%.*]] = add i8 [[MUL1]], 1
; GFX7-NEXT: [[ADD2:%.*]] = add i8 [[MUL2]], 1
; GFX7-NEXT: [[ADD3:%.*]] = add i8 [[MUL3]], 1
; GFX7-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0
; GFX7-NEXT: [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1
; GFX7-NEXT: [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD2]], i64 2
; GFX7-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[ADD3]], i64 3
; GFX7-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT]], align 16
; GFX7-NEXT: ret void
;
; GFX8-LABEL: define protected amdgpu_kernel void @arith_4(
; GFX8-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX8-NEXT: [[ENTRY:.*:]]
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX8-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1)
; GFX8-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1)
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT]], align 16
; GFX8-NEXT: ret void
;
; GFX9-LABEL: define protected amdgpu_kernel void @arith_4(
; GFX9-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[ENTRY:.*:]]
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX9-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1)
; GFX9-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1)
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT]], align 16
; GFX9-NEXT: ret void
;
entry:
%el0 = extractelement <16 x i8> %invec, i64 0
%el1 = extractelement <16 x i8> %invec, i64 1
%el2 = extractelement <16 x i8> %invec, i64 2
%el3 = extractelement <16 x i8> %invec, i64 3
%mul0 = mul i8 %el0, 1
%mul1 = mul i8 %el1, 1
%mul2 = mul i8 %el2, 1
%mul3 = mul i8 %el3, 1
%add0 = add i8 %mul0, 1
%add1 = add i8 %mul1, 1
%add2 = add i8 %mul2, 1
%add3 = add i8 %mul3, 1
%vecins0 = insertelement <16 x i8> poison, i8 %add0, i64 0
%vecins1 = insertelement <16 x i8> %vecins0, i8 %add1, i64 1
%vecins2 = insertelement <16 x i8> %vecins1, i8 %add2, i64 2
%vecins3 = insertelement <16 x i8> %vecins2, i8 %add3, i64 3
store <16 x i8> %vecins3, ptr %out
ret void
}
; arith_16: full-width (16-lane) i8 arithmetic chain.
; The input IR extracts every lane of %invec, computes (x * 1) + 1 on each as
; scalar i8, rebuilds a <16 x i8> with an insertelement chain over lanes
; 0..15 and stores it to %out. %flag is not used by the body.
; Expected output per the assertions below:
;   - hawaii run: everything stays scalar.
;   - fiji and gfx900 runs: the work is split into four <4 x i8> groups
;     (lanes 0-3, 4-7, 8-11 and 12-15). Each group gets its own mul/add, is
;     widened to <16 x i8>, and the groups are merged by a chain of
;     two-operand shufflevectors. No single <16 x i8> mul/add is expected.
define protected amdgpu_kernel void @arith_16(<16 x i8> %invec, ptr %out, i32 %flag) {
; GFX7-LABEL: define protected amdgpu_kernel void @arith_16(
; GFX7-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX7-NEXT: [[ENTRY:.*:]]
; GFX7-NEXT: [[EL0:%.*]] = extractelement <16 x i8> [[INVEC]], i64 0
; GFX7-NEXT: [[EL1:%.*]] = extractelement <16 x i8> [[INVEC]], i64 1
; GFX7-NEXT: [[EL2:%.*]] = extractelement <16 x i8> [[INVEC]], i64 2
; GFX7-NEXT: [[EL3:%.*]] = extractelement <16 x i8> [[INVEC]], i64 3
; GFX7-NEXT: [[EL4:%.*]] = extractelement <16 x i8> [[INVEC]], i64 4
; GFX7-NEXT: [[EL5:%.*]] = extractelement <16 x i8> [[INVEC]], i64 5
; GFX7-NEXT: [[EL6:%.*]] = extractelement <16 x i8> [[INVEC]], i64 6
; GFX7-NEXT: [[EL7:%.*]] = extractelement <16 x i8> [[INVEC]], i64 7
; GFX7-NEXT: [[EL8:%.*]] = extractelement <16 x i8> [[INVEC]], i64 8
; GFX7-NEXT: [[EL9:%.*]] = extractelement <16 x i8> [[INVEC]], i64 9
; GFX7-NEXT: [[EL10:%.*]] = extractelement <16 x i8> [[INVEC]], i64 10
; GFX7-NEXT: [[EL11:%.*]] = extractelement <16 x i8> [[INVEC]], i64 11
; GFX7-NEXT: [[EL12:%.*]] = extractelement <16 x i8> [[INVEC]], i64 12
; GFX7-NEXT: [[EL13:%.*]] = extractelement <16 x i8> [[INVEC]], i64 13
; GFX7-NEXT: [[EL14:%.*]] = extractelement <16 x i8> [[INVEC]], i64 14
; GFX7-NEXT: [[EL15:%.*]] = extractelement <16 x i8> [[INVEC]], i64 15
; GFX7-NEXT: [[MUL0:%.*]] = mul i8 [[EL0]], 1
; GFX7-NEXT: [[MUL1:%.*]] = mul i8 [[EL1]], 1
; GFX7-NEXT: [[MUL2:%.*]] = mul i8 [[EL2]], 1
; GFX7-NEXT: [[MUL3:%.*]] = mul i8 [[EL3]], 1
; GFX7-NEXT: [[MUL4:%.*]] = mul i8 [[EL4]], 1
; GFX7-NEXT: [[MUL5:%.*]] = mul i8 [[EL5]], 1
; GFX7-NEXT: [[MUL6:%.*]] = mul i8 [[EL6]], 1
; GFX7-NEXT: [[MUL7:%.*]] = mul i8 [[EL7]], 1
; GFX7-NEXT: [[MUL8:%.*]] = mul i8 [[EL8]], 1
; GFX7-NEXT: [[MUL9:%.*]] = mul i8 [[EL9]], 1
; GFX7-NEXT: [[MUL10:%.*]] = mul i8 [[EL10]], 1
; GFX7-NEXT: [[MUL11:%.*]] = mul i8 [[EL11]], 1
; GFX7-NEXT: [[MUL12:%.*]] = mul i8 [[EL12]], 1
; GFX7-NEXT: [[MUL13:%.*]] = mul i8 [[EL13]], 1
; GFX7-NEXT: [[MUL14:%.*]] = mul i8 [[EL14]], 1
; GFX7-NEXT: [[MUL15:%.*]] = mul i8 [[EL15]], 1
; GFX7-NEXT: [[ADD0:%.*]] = add i8 [[MUL0]], 1
; GFX7-NEXT: [[ADD1:%.*]] = add i8 [[MUL1]], 1
; GFX7-NEXT: [[ADD2:%.*]] = add i8 [[MUL2]], 1
; GFX7-NEXT: [[ADD3:%.*]] = add i8 [[MUL3]], 1
; GFX7-NEXT: [[ADD4:%.*]] = add i8 [[MUL4]], 1
; GFX7-NEXT: [[ADD5:%.*]] = add i8 [[MUL5]], 1
; GFX7-NEXT: [[ADD6:%.*]] = add i8 [[MUL6]], 1
; GFX7-NEXT: [[ADD7:%.*]] = add i8 [[MUL7]], 1
; GFX7-NEXT: [[ADD8:%.*]] = add i8 [[MUL8]], 1
; GFX7-NEXT: [[ADD9:%.*]] = add i8 [[MUL9]], 1
; GFX7-NEXT: [[ADD10:%.*]] = add i8 [[MUL10]], 1
; GFX7-NEXT: [[ADD11:%.*]] = add i8 [[MUL11]], 1
; GFX7-NEXT: [[ADD12:%.*]] = add i8 [[MUL12]], 1
; GFX7-NEXT: [[ADD13:%.*]] = add i8 [[MUL13]], 1
; GFX7-NEXT: [[ADD14:%.*]] = add i8 [[MUL14]], 1
; GFX7-NEXT: [[ADD15:%.*]] = add i8 [[MUL15]], 1
; GFX7-NEXT: [[VECINS0:%.*]] = insertelement <16 x i8> poison, i8 [[ADD0]], i64 0
; GFX7-NEXT: [[VECINS1:%.*]] = insertelement <16 x i8> [[VECINS0]], i8 [[ADD1]], i64 1
; GFX7-NEXT: [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[ADD2]], i64 2
; GFX7-NEXT: [[VECINS3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[ADD3]], i64 3
; GFX7-NEXT: [[VECINS4:%.*]] = insertelement <16 x i8> [[VECINS3]], i8 [[ADD4]], i64 4
; GFX7-NEXT: [[VECINS5:%.*]] = insertelement <16 x i8> [[VECINS4]], i8 [[ADD5]], i64 5
; GFX7-NEXT: [[VECINS6:%.*]] = insertelement <16 x i8> [[VECINS5]], i8 [[ADD6]], i64 6
; GFX7-NEXT: [[VECINS7:%.*]] = insertelement <16 x i8> [[VECINS6]], i8 [[ADD7]], i64 7
; GFX7-NEXT: [[VECINS8:%.*]] = insertelement <16 x i8> [[VECINS7]], i8 [[ADD8]], i64 8
; GFX7-NEXT: [[VECINS9:%.*]] = insertelement <16 x i8> [[VECINS8]], i8 [[ADD9]], i64 9
; GFX7-NEXT: [[VECINS10:%.*]] = insertelement <16 x i8> [[VECINS9]], i8 [[ADD10]], i64 10
; GFX7-NEXT: [[VECINS11:%.*]] = insertelement <16 x i8> [[VECINS10]], i8 [[ADD11]], i64 11
; GFX7-NEXT: [[VECINS12:%.*]] = insertelement <16 x i8> [[VECINS11]], i8 [[ADD12]], i64 12
; GFX7-NEXT: [[VECINS13:%.*]] = insertelement <16 x i8> [[VECINS12]], i8 [[ADD13]], i64 13
; GFX7-NEXT: [[VECINS14:%.*]] = insertelement <16 x i8> [[VECINS13]], i8 [[ADD14]], i64 14
; GFX7-NEXT: [[VECINS153:%.*]] = insertelement <16 x i8> [[VECINS14]], i8 [[ADD15]], i64 15
; GFX7-NEXT: store <16 x i8> [[VECINS153]], ptr [[OUT]], align 16
; GFX7-NEXT: ret void
;
; GFX8-LABEL: define protected amdgpu_kernel void @arith_16(
; GFX8-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX8-NEXT: [[ENTRY:.*:]]
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX8-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1)
; GFX8-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1)
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; GFX8-NEXT: [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], splat (i8 1)
; GFX8-NEXT: [[TMP5:%.*]] = add <4 x i8> [[TMP4]], splat (i8 1)
; GFX8-NEXT: [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GFX8-NEXT: [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], splat (i8 1)
; GFX8-NEXT: [[TMP8:%.*]] = add <4 x i8> [[TMP7]], splat (i8 1)
; GFX8-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GFX8-NEXT: [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], splat (i8 1)
; GFX8-NEXT: [[TMP11:%.*]] = add <4 x i8> [[TMP10]], splat (i8 1)
; GFX8-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; GFX8-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
; GFX8-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: [[VECINS153:%.*]] = shufflevector <16 x i8> [[VECINS112]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; GFX8-NEXT: store <16 x i8> [[VECINS153]], ptr [[OUT]], align 16
; GFX8-NEXT: ret void
;
; GFX9-LABEL: define protected amdgpu_kernel void @arith_16(
; GFX9-SAME: <16 x i8> [[INVEC:%.*]], ptr [[OUT:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[ENTRY:.*:]]
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX9-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[TMP0]], splat (i8 1)
; GFX9-NEXT: [[TMP2:%.*]] = add <4 x i8> [[TMP1]], splat (i8 1)
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
; GFX9-NEXT: [[TMP4:%.*]] = mul <4 x i8> [[TMP3]], splat (i8 1)
; GFX9-NEXT: [[TMP5:%.*]] = add <4 x i8> [[TMP4]], splat (i8 1)
; GFX9-NEXT: [[TMP6:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
; GFX9-NEXT: [[TMP7:%.*]] = mul <4 x i8> [[TMP6]], splat (i8 1)
; GFX9-NEXT: [[TMP8:%.*]] = add <4 x i8> [[TMP7]], splat (i8 1)
; GFX9-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[INVEC]], <16 x i8> poison, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; GFX9-NEXT: [[TMP10:%.*]] = mul <4 x i8> [[TMP9]], splat (i8 1)
; GFX9-NEXT: [[TMP11:%.*]] = add <4 x i8> [[TMP10]], splat (i8 1)
; GFX9-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: [[VECINS71:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> [[TMP13]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; GFX9-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP8]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: [[VECINS112:%.*]] = shufflevector <16 x i8> [[VECINS71]], <16 x i8> [[TMP14]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 12, i32 13, i32 14, i32 15>
; GFX9-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: [[VECINS153:%.*]] = shufflevector <16 x i8> [[VECINS112]], <16 x i8> [[TMP15]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
; GFX9-NEXT: store <16 x i8> [[VECINS153]], ptr [[OUT]], align 16
; GFX9-NEXT: ret void
;
entry:
%el0 = extractelement <16 x i8> %invec, i64 0
%el1 = extractelement <16 x i8> %invec, i64 1
%el2 = extractelement <16 x i8> %invec, i64 2
%el3 = extractelement <16 x i8> %invec, i64 3
%el4 = extractelement <16 x i8> %invec, i64 4
%el5 = extractelement <16 x i8> %invec, i64 5
%el6 = extractelement <16 x i8> %invec, i64 6
%el7 = extractelement <16 x i8> %invec, i64 7
%el8 = extractelement <16 x i8> %invec, i64 8
%el9 = extractelement <16 x i8> %invec, i64 9
%el10 = extractelement <16 x i8> %invec, i64 10
%el11 = extractelement <16 x i8> %invec, i64 11
%el12 = extractelement <16 x i8> %invec, i64 12
%el13 = extractelement <16 x i8> %invec, i64 13
%el14 = extractelement <16 x i8> %invec, i64 14
%el15 = extractelement <16 x i8> %invec, i64 15
%mul0 = mul i8 %el0, 1
%mul1 = mul i8 %el1, 1
%mul2 = mul i8 %el2, 1
%mul3 = mul i8 %el3, 1
%mul4 = mul i8 %el4, 1
%mul5 = mul i8 %el5, 1
%mul6 = mul i8 %el6, 1
%mul7 = mul i8 %el7, 1
%mul8 = mul i8 %el8, 1
%mul9 = mul i8 %el9, 1
%mul10 = mul i8 %el10, 1
%mul11 = mul i8 %el11, 1
%mul12 = mul i8 %el12, 1
%mul13 = mul i8 %el13, 1
%mul14 = mul i8 %el14, 1
%mul15 = mul i8 %el15, 1
%add0 = add i8 %mul0, 1
%add1 = add i8 %mul1, 1
%add2 = add i8 %mul2, 1
%add3 = add i8 %mul3, 1
%add4 = add i8 %mul4, 1
%add5 = add i8 %mul5, 1
%add6 = add i8 %mul6, 1
%add7 = add i8 %mul7, 1
%add8 = add i8 %mul8, 1
%add9 = add i8 %mul9, 1
%add10 = add i8 %mul10, 1
%add11 = add i8 %mul11, 1
%add12 = add i8 %mul12, 1
%add13 = add i8 %mul13, 1
%add14 = add i8 %mul14, 1
%add15 = add i8 %mul15, 1
%vecins0 = insertelement <16 x i8> poison, i8 %add0, i64 0
%vecins1 = insertelement <16 x i8> %vecins0, i8 %add1, i64 1
%vecins2 = insertelement <16 x i8> %vecins1, i8 %add2, i64 2
%vecins3 = insertelement <16 x i8> %vecins2, i8 %add3, i64 3
%vecins4 = insertelement <16 x i8> %vecins3, i8 %add4, i64 4
%vecins5 = insertelement <16 x i8> %vecins4, i8 %add5, i64 5
%vecins6 = insertelement <16 x i8> %vecins5, i8 %add6, i64 6
%vecins7 = insertelement <16 x i8> %vecins6, i8 %add7, i64 7
%vecins8 = insertelement <16 x i8> %vecins7, i8 %add8, i64 8
%vecins9 = insertelement <16 x i8> %vecins8, i8 %add9, i64 9
%vecins10 = insertelement <16 x i8> %vecins9, i8 %add10, i64 10
%vecins11 = insertelement <16 x i8> %vecins10, i8 %add11, i64 11
%vecins12 = insertelement <16 x i8> %vecins11, i8 %add12, i64 12
%vecins13 = insertelement <16 x i8> %vecins12, i8 %add13, i64 13
%vecins14 = insertelement <16 x i8> %vecins13, i8 %add14, i64 14
%vecins15 = insertelement <16 x i8> %vecins14, i8 %add15, i64 15
store <16 x i8> %vecins15, ptr %out
ret void
}
; phi_2: two adjacent i8 values carried around a loop through phi nodes.
; The entry block loads two i8 values from %inptr0 (addrspace(3)) at byte
; offsets 0 and 1. The do.body block has one i8 phi per value, fed by the
; entry loads on the first iteration and by the reloads from the same two
; addresses on later iterations. The reloaded pair and the phi pair are each
; inserted into lanes 8 and 9 of a poison <16 x i8>. The phi-based vector is
; stored to %inptr1 inside the loop, and the loop exits when %flag == 0.
; The exit block stores the phi-based vector to %out and the reload-based
; vector to %out1.
; Expected output per the assertions below:
;   - hawaii run: loads, phis and insertelements all stay scalar.
;   - fiji and gfx900 runs: each pair of i8 loads becomes one <2 x i8> load,
;     the two i8 phis become a single <2 x i8> phi, and the lane 8/9
;     insertions become shufflevectors.
define protected amdgpu_kernel void @phi_2(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out, ptr %out1, i32 %flag) {
; GFX7-LABEL: define protected amdgpu_kernel void @phi_2(
; GFX7-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX7-NEXT: [[ENTRY:.*]]:
; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
; GFX7-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
; GFX7-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
; GFX7-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
; GFX7-NEXT: br label %[[DO_BODY:.*]]
; GFX7: [[DO_BODY]]:
; GFX7-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
; GFX7-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
; GFX7-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
; GFX7-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
; GFX7-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
; GFX7-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
; GFX7-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
; GFX7-NEXT: [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
; GFX7-NEXT: store <16 x i8> [[VEC111]], ptr addrspace(3) [[INPTR1]], align 2
; GFX7-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX7-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX7: [[EXIT]]:
; GFX7-NEXT: store <16 x i8> [[VEC111]], ptr [[OUT]], align 16
; GFX7-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT1]], align 16
; GFX7-NEXT: ret void
;
; GFX8-LABEL: define protected amdgpu_kernel void @phi_2(
; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX8-NEXT: [[ENTRY:.*]]:
; GFX8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
; GFX8-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX8-NEXT: br label %[[DO_BODY:.*]]
; GFX8: [[DO_BODY]]:
; GFX8-NEXT: [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
; GFX8-NEXT: [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: [[VEC111:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: store <16 x i8> [[VEC111]], ptr addrspace(3) [[INPTR1]], align 2
; GFX8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX8-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX8: [[EXIT]]:
; GFX8-NEXT: store <16 x i8> [[VEC111]], ptr [[OUT]], align 16
; GFX8-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT1]], align 16
; GFX8-NEXT: ret void
;
; GFX9-LABEL: define protected amdgpu_kernel void @phi_2(
; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[ENTRY:.*]]:
; GFX9-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX9-NEXT: br label %[[DO_BODY:.*]]
; GFX9: [[DO_BODY]]:
; GFX9-NEXT: [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
; GFX9-NEXT: [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: [[VEC111:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: store <16 x i8> [[VEC111]], ptr addrspace(3) [[INPTR1]], align 2
; GFX9-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX9-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX9: [[EXIT]]:
; GFX9-NEXT: store <16 x i8> [[VEC111]], ptr [[OUT]], align 16
; GFX9-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT1]], align 16
; GFX9-NEXT: ret void
;
entry:
%gep0 = getelementptr i8, ptr addrspace(3) %inptr0, i32 0
%ele0 = load i8, ptr addrspace(3) %gep0, align 8
%gep1 = getelementptr i8, ptr addrspace(3) %inptr0, i32 1
%ele1 = load i8, ptr addrspace(3) %gep1, align 1
br label %do.body
do.body:
%phi2 = phi i8 [ %ele1, %entry ], [ %otherele1, %do.body ]
%phi3 = phi i8 [ %ele0, %entry ], [ %otherele0, %do.body ]
%otherele0 = load i8, ptr addrspace(3) %gep0, align 8
%otherele1 = load i8, ptr addrspace(3) %gep1, align 1
%vec00 = insertelement <16 x i8> poison, i8 %otherele0, i64 8
%vec01 = insertelement <16 x i8> %vec00, i8 %otherele1, i64 9
%vec10 = insertelement <16 x i8> poison, i8 %phi3, i64 8
%vec11 = insertelement <16 x i8> %vec10, i8 %phi2, i64 9
store <16 x i8> %vec11, ptr addrspace(3) %inptr1, align 2
%cmp = icmp eq i32 %flag, 0
br i1 %cmp, label %exit, label %do.body
exit:
store <16 x i8> %vec11, ptr %out
store <16 x i8> %vec01, ptr %out1
ret void
}
; phi_3: three adjacent i8 loads from %inptr0 (byte offsets 0, 1 and 2) feed
; three loop-carried i8 phis. Both the freshly loaded values and the phi values
; are inserted into lanes 8-10 of otherwise-poison <16 x i8> vectors.
; Expected output, as recorded by the assertions below:
;   * GFX7 run - everything stays scalar.
;   * GFX8 and GFX9 runs - offsets 0 and 1 become one <2 x i8> load and one
;     <2 x i8> phi; the offset-2 element stays scalar and is inserted at lane 10.
define protected amdgpu_kernel void @phi_3(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out, ptr %out1, i32 %flag) {
; GFX7-LABEL: define protected amdgpu_kernel void @phi_3(
; GFX7-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX7-NEXT: [[ENTRY:.*]]:
; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
; GFX7-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
; GFX7-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
; GFX7-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
; GFX7-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
; GFX7-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
; GFX7-NEXT: br label %[[DO_BODY:.*]]
; GFX7: [[DO_BODY]]:
; GFX7-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
; GFX7-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
; GFX7-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
; GFX7-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
; GFX7-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
; GFX7-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
; GFX7-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
; GFX7-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
; GFX7-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[OTHERELE2]], i64 10
; GFX7-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
; GFX7-NEXT: [[VEC111:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
; GFX7-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC111]], i8 [[PHI1]], i64 10
; GFX7-NEXT: store <16 x i8> [[VEC12]], ptr addrspace(3) [[INPTR1]], align 2
; GFX7-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX7-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX7: [[EXIT]]:
; GFX7-NEXT: store <16 x i8> [[VEC12]], ptr [[OUT]], align 16
; GFX7-NEXT: store <16 x i8> [[VEC02]], ptr [[OUT1]], align 16
; GFX7-NEXT: ret void
;
; GFX8-LABEL: define protected amdgpu_kernel void @phi_3(
; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX8-NEXT: [[ENTRY:.*]]:
; GFX8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
; GFX8-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX8-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
; GFX8-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
; GFX8-NEXT: br label %[[DO_BODY:.*]]
; GFX8: [[DO_BODY]]:
; GFX8-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
; GFX8-NEXT: [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
; GFX8-NEXT: [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX8-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[OTHERELE2]], i64 10
; GFX8-NEXT: [[VEC111:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC111]], i8 [[PHI1]], i64 10
; GFX8-NEXT: store <16 x i8> [[VEC12]], ptr addrspace(3) [[INPTR1]], align 2
; GFX8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX8-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX8: [[EXIT]]:
; GFX8-NEXT: store <16 x i8> [[VEC12]], ptr [[OUT]], align 16
; GFX8-NEXT: store <16 x i8> [[VEC02]], ptr [[OUT1]], align 16
; GFX8-NEXT: ret void
;
; GFX9-LABEL: define protected amdgpu_kernel void @phi_3(
; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[ENTRY:.*]]:
; GFX9-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX9-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
; GFX9-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
; GFX9-NEXT: br label %[[DO_BODY:.*]]
; GFX9: [[DO_BODY]]:
; GFX9-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
; GFX9-NEXT: [[TMP1:%.*]] = phi <2 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
; GFX9-NEXT: [[TMP2]] = load <2 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX9-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[OTHERELE2]], i64 10
; GFX9-NEXT: [[VEC111:%.*]] = shufflevector <2 x i8> [[TMP1]], <2 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC111]], i8 [[PHI1]], i64 10
; GFX9-NEXT: store <16 x i8> [[VEC12]], ptr addrspace(3) [[INPTR1]], align 2
; GFX9-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX9-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX9: [[EXIT]]:
; GFX9-NEXT: store <16 x i8> [[VEC12]], ptr [[OUT]], align 16
; GFX9-NEXT: store <16 x i8> [[VEC02]], ptr [[OUT1]], align 16
; GFX9-NEXT: ret void
;
entry:
; Initial values for the loop-carried phis: bytes at %inptr0 + 0, + 1, + 2.
%gep0 = getelementptr i8, ptr addrspace(3) %inptr0, i32 0
%ele0 = load i8, ptr addrspace(3) %gep0, align 8
%gep1 = getelementptr i8, ptr addrspace(3) %inptr0, i32 1
%ele1 = load i8, ptr addrspace(3) %gep1, align 1
%gep2 = getelementptr i8, ptr addrspace(3) %inptr0, i32 2
%ele2 = load i8, ptr addrspace(3) %gep2, align 2
br label %do.body
do.body:
; Each phi takes the entry-block load on the first iteration and this block's
; reload of the same address on the back edge.
%phi1 = phi i8 [ %ele2, %entry ], [ %otherele2, %do.body ]
%phi2 = phi i8 [ %ele1, %entry ], [ %otherele1, %do.body ]
%phi3 = phi i8 [ %ele0, %entry ], [ %otherele0, %do.body ]
; Reload the same three addresses inside the loop.
%otherele0 = load i8, ptr addrspace(3) %gep0, align 8
%otherele1 = load i8, ptr addrspace(3) %gep1, align 1
%otherele2 = load i8, ptr addrspace(3) %gep2, align 2
; %vec02 holds this iteration's loads in lanes 8-10; all other lanes are poison.
%vec00 = insertelement <16 x i8> poison, i8 %otherele0, i64 8
%vec01 = insertelement <16 x i8> %vec00, i8 %otherele1, i64 9
%vec02 = insertelement <16 x i8> %vec01, i8 %otherele2, i64 10
; %vec12 holds the phi values in the same lanes 8-10.
%vec10 = insertelement <16 x i8> poison, i8 %phi3, i64 8
%vec11 = insertelement <16 x i8> %vec10, i8 %phi2, i64 9
%vec12 = insertelement <16 x i8> %vec11, i8 %phi1, i64 10
store <16 x i8> %vec12, ptr addrspace(3) %inptr1, align 2
; The exit condition depends only on the %flag argument.
%cmp = icmp eq i32 %flag, 0
br i1 %cmp, label %exit, label %do.body
exit:
; Both vectors are used after the loop. These stores carry no explicit
; alignment; the assertions above show them printed with align 16.
store <16 x i8> %vec12, ptr %out
store <16 x i8> %vec02, ptr %out1
ret void
}
; phi_4: same shape as a three-element phi test, but with four adjacent i8
; loads from %inptr0 (byte offsets 0-3) feeding four loop-carried i8 phis. The
; loaded values and the phi values are each inserted into lanes 8-11 of
; otherwise-poison <16 x i8> vectors.
; Expected output, as recorded by the assertions below:
;   * GFX7 run - everything stays scalar.
;   * GFX8 and GFX9 runs - a single <4 x i8> load and <4 x i8> phi, widened to
;     <16 x i8> with one shufflevector per result vector.
define protected amdgpu_kernel void @phi_4(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out, ptr %out1, i32 %flag) {
; GFX7-LABEL: define protected amdgpu_kernel void @phi_4(
; GFX7-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX7-NEXT: [[ENTRY:.*]]:
; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
; GFX7-NEXT: [[ELE0:%.*]] = load i8, ptr addrspace(3) [[GEP0]], align 8
; GFX7-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 1
; GFX7-NEXT: [[ELE1:%.*]] = load i8, ptr addrspace(3) [[GEP1]], align 1
; GFX7-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 2
; GFX7-NEXT: [[ELE2:%.*]] = load i8, ptr addrspace(3) [[GEP2]], align 2
; GFX7-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 3
; GFX7-NEXT: [[ELE3:%.*]] = load i8, ptr addrspace(3) [[GEP3]], align 1
; GFX7-NEXT: br label %[[DO_BODY:.*]]
; GFX7: [[DO_BODY]]:
; GFX7-NEXT: [[PHI0:%.*]] = phi i8 [ [[ELE3]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ]
; GFX7-NEXT: [[PHI1:%.*]] = phi i8 [ [[ELE2]], %[[ENTRY]] ], [ [[OTHERELE2:%.*]], %[[DO_BODY]] ]
; GFX7-NEXT: [[PHI2:%.*]] = phi i8 [ [[ELE1]], %[[ENTRY]] ], [ [[OTHERELE1:%.*]], %[[DO_BODY]] ]
; GFX7-NEXT: [[PHI3:%.*]] = phi i8 [ [[ELE0]], %[[ENTRY]] ], [ [[OTHERELE0:%.*]], %[[DO_BODY]] ]
; GFX7-NEXT: [[OTHERELE0]] = load i8, ptr addrspace(3) [[GEP0]], align 8
; GFX7-NEXT: [[OTHERELE1]] = load i8, ptr addrspace(3) [[GEP1]], align 1
; GFX7-NEXT: [[OTHERELE2]] = load i8, ptr addrspace(3) [[GEP2]], align 2
; GFX7-NEXT: [[OTHERELE3]] = load i8, ptr addrspace(3) [[GEP3]], align 1
; GFX7-NEXT: [[VEC00:%.*]] = insertelement <16 x i8> poison, i8 [[OTHERELE0]], i64 8
; GFX7-NEXT: [[VEC01:%.*]] = insertelement <16 x i8> [[VEC00]], i8 [[OTHERELE1]], i64 9
; GFX7-NEXT: [[VEC02:%.*]] = insertelement <16 x i8> [[VEC01]], i8 [[OTHERELE2]], i64 10
; GFX7-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[VEC02]], i8 [[OTHERELE3]], i64 11
; GFX7-NEXT: [[VEC10:%.*]] = insertelement <16 x i8> poison, i8 [[PHI3]], i64 8
; GFX7-NEXT: [[VEC11:%.*]] = insertelement <16 x i8> [[VEC10]], i8 [[PHI2]], i64 9
; GFX7-NEXT: [[VEC12:%.*]] = insertelement <16 x i8> [[VEC11]], i8 [[PHI1]], i64 10
; GFX7-NEXT: [[VEC131:%.*]] = insertelement <16 x i8> [[VEC12]], i8 [[PHI0]], i64 11
; GFX7-NEXT: store <16 x i8> [[VEC131]], ptr addrspace(3) [[INPTR1]], align 2
; GFX7-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX7-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX7: [[EXIT]]:
; GFX7-NEXT: store <16 x i8> [[VEC131]], ptr [[OUT]], align 16
; GFX7-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT1]], align 16
; GFX7-NEXT: ret void
;
; GFX8-LABEL: define protected amdgpu_kernel void @phi_4(
; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX8-NEXT: [[ENTRY:.*]]:
; GFX8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
; GFX8-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX8-NEXT: br label %[[DO_BODY:.*]]
; GFX8: [[DO_BODY]]:
; GFX8-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
; GFX8-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: [[VEC131:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: store <16 x i8> [[VEC131]], ptr addrspace(3) [[INPTR1]], align 2
; GFX8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX8-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX8: [[EXIT]]:
; GFX8-NEXT: store <16 x i8> [[VEC131]], ptr [[OUT]], align 16
; GFX8-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT1]], align 16
; GFX8-NEXT: ret void
;
; GFX9-LABEL: define protected amdgpu_kernel void @phi_4(
; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[ENTRY:.*]]:
; GFX9-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX9-NEXT: br label %[[DO_BODY:.*]]
; GFX9: [[DO_BODY]]:
; GFX9-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
; GFX9-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: [[VEC131:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: store <16 x i8> [[VEC131]], ptr addrspace(3) [[INPTR1]], align 2
; GFX9-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX9-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX9: [[EXIT]]:
; GFX9-NEXT: store <16 x i8> [[VEC131]], ptr [[OUT]], align 16
; GFX9-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT1]], align 16
; GFX9-NEXT: ret void
;
entry:
; Initial values for the loop-carried phis: bytes at %inptr0 + 0 .. + 3.
%gep0 = getelementptr i8, ptr addrspace(3) %inptr0, i32 0
%ele0 = load i8, ptr addrspace(3) %gep0, align 8
%gep1 = getelementptr i8, ptr addrspace(3) %inptr0, i32 1
%ele1 = load i8, ptr addrspace(3) %gep1, align 1
%gep2 = getelementptr i8, ptr addrspace(3) %inptr0, i32 2
%ele2 = load i8, ptr addrspace(3) %gep2, align 2
%gep3 = getelementptr i8, ptr addrspace(3) %inptr0, i32 3
%ele3 = load i8, ptr addrspace(3) %gep3, align 1
br label %do.body
do.body:
; The phis are declared in reverse element order (%phi0 carries element 3,
; %phi3 carries element 0); the back-edge operands are the reloads below.
%phi0 = phi i8 [ %ele3, %entry ], [ %otherele3, %do.body ]
%phi1 = phi i8 [ %ele2, %entry ], [ %otherele2, %do.body ]
%phi2 = phi i8 [ %ele1, %entry ], [ %otherele1, %do.body ]
%phi3 = phi i8 [ %ele0, %entry ], [ %otherele0, %do.body ]
; Reload the same four addresses inside the loop.
%otherele0 = load i8, ptr addrspace(3) %gep0, align 8
%otherele1 = load i8, ptr addrspace(3) %gep1, align 1
%otherele2 = load i8, ptr addrspace(3) %gep2, align 2
%otherele3 = load i8, ptr addrspace(3) %gep3, align 1
; %vec03 holds this iteration's loads in lanes 8-11; all other lanes are poison.
%vec00 = insertelement <16 x i8> poison, i8 %otherele0, i64 8
%vec01 = insertelement <16 x i8> %vec00, i8 %otherele1, i64 9
%vec02 = insertelement <16 x i8> %vec01, i8 %otherele2, i64 10
%vec03 = insertelement <16 x i8> %vec02, i8 %otherele3, i64 11
; %vec13 holds the phi values in the same lanes 8-11.
%vec10 = insertelement <16 x i8> poison, i8 %phi3, i64 8
%vec11 = insertelement <16 x i8> %vec10, i8 %phi2, i64 9
%vec12 = insertelement <16 x i8> %vec11, i8 %phi1, i64 10
%vec13 = insertelement <16 x i8> %vec12, i8 %phi0, i64 11
store <16 x i8> %vec13, ptr addrspace(3) %inptr1, align 2
; The exit condition depends only on the %flag argument.
%cmp = icmp eq i32 %flag, 0
br i1 %cmp, label %exit, label %do.body
exit:
; Both vectors are used after the loop. These stores carry no explicit
; alignment; the assertions above show them printed with align 16.
store <16 x i8> %vec13, ptr %out
store <16 x i8> %vec03, ptr %out1
ret void
}
; phi_4_with_stores: four adjacent i8 loads from %inptr0 feed four loop-carried
; i8 phis, and the phi values are additionally written back with four scalar
; i8 stores to the same four addresses before being packed into lanes 8-11 of a
; <16 x i8> vector.
; Expected output, as recorded by the assertions below: all three runs (GFX7,
; GFX8 and GFX9 prefixes) produce the same vectorized form - a <4 x i8> load,
; a <4 x i8> phi, one <4 x i8> store and one widening shufflevector.
define protected amdgpu_kernel void @phi_4_with_stores(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out, ptr %out1, i32 %flag) {
; GFX7-LABEL: define protected amdgpu_kernel void @phi_4_with_stores(
; GFX7-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX7-NEXT: [[ENTRY:.*]]:
; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
; GFX7-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX7-NEXT: br label %[[DO_BODY:.*]]
; GFX7: [[DO_BODY]]:
; GFX7-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
; GFX7-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX7-NEXT: store <4 x i8> [[TMP1]], ptr addrspace(3) [[GEP0]], align 2
; GFX7-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX7-NEXT: store <16 x i8> [[TMP3]], ptr addrspace(3) [[INPTR1]], align 2
; GFX7-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX7-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX7: [[EXIT]]:
; GFX7-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT]], align 16
; GFX7-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT1]], align 16
; GFX7-NEXT: ret void
;
; GFX8-LABEL: define protected amdgpu_kernel void @phi_4_with_stores(
; GFX8-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX8-NEXT: [[ENTRY:.*]]:
; GFX8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
; GFX8-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX8-NEXT: br label %[[DO_BODY:.*]]
; GFX8: [[DO_BODY]]:
; GFX8-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
; GFX8-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX8-NEXT: store <4 x i8> [[TMP1]], ptr addrspace(3) [[GEP0]], align 2
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX8-NEXT: store <16 x i8> [[TMP3]], ptr addrspace(3) [[INPTR1]], align 2
; GFX8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX8-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX8: [[EXIT]]:
; GFX8-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT]], align 16
; GFX8-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT1]], align 16
; GFX8-NEXT: ret void
;
; GFX9-LABEL: define protected amdgpu_kernel void @phi_4_with_stores(
; GFX9-SAME: ptr addrspace(3) [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[ENTRY:.*]]:
; GFX9-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR0]], i32 0
; GFX9-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX9-NEXT: br label %[[DO_BODY:.*]]
; GFX9: [[DO_BODY]]:
; GFX9-NEXT: [[TMP1:%.*]] = phi <4 x i8> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP2:%.*]], %[[DO_BODY]] ]
; GFX9-NEXT: [[TMP2]] = load <4 x i8>, ptr addrspace(3) [[GEP0]], align 8
; GFX9-NEXT: store <4 x i8> [[TMP1]], ptr addrspace(3) [[GEP0]], align 2
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> poison, <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; GFX9-NEXT: store <16 x i8> [[TMP3]], ptr addrspace(3) [[INPTR1]], align 2
; GFX9-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX9-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX9: [[EXIT]]:
; GFX9-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT]], align 16
; GFX9-NEXT: store <16 x i8> [[TMP3]], ptr [[OUT1]], align 16
; GFX9-NEXT: ret void
;
entry:
; Initial values for the loop-carried phis: bytes at %inptr0 + 0 .. + 3.
%gep0 = getelementptr i8, ptr addrspace(3) %inptr0, i32 0
%ele0 = load i8, ptr addrspace(3) %gep0, align 8
%gep1 = getelementptr i8, ptr addrspace(3) %inptr0, i32 1
%ele1 = load i8, ptr addrspace(3) %gep1, align 1
%gep2 = getelementptr i8, ptr addrspace(3) %inptr0, i32 2
%ele2 = load i8, ptr addrspace(3) %gep2, align 2
%gep3 = getelementptr i8, ptr addrspace(3) %inptr0, i32 3
%ele3 = load i8, ptr addrspace(3) %gep3, align 1
br label %do.body
do.body:
; The phis are declared in reverse element order (%phi0 carries element 3,
; %phi3 carries element 0); the back-edge operands are the reloads below.
%phi0 = phi i8 [ %ele3, %entry ], [ %otherele3, %do.body ]
%phi1 = phi i8 [ %ele2, %entry ], [ %otherele2, %do.body ]
%phi2 = phi i8 [ %ele1, %entry ], [ %otherele1, %do.body ]
%phi3 = phi i8 [ %ele0, %entry ], [ %otherele0, %do.body ]
; Reload the four bytes before they are overwritten by the stores below.
%otherele0 = load i8, ptr addrspace(3) %gep0, align 8
%otherele1 = load i8, ptr addrspace(3) %gep1, align 1
%otherele2 = load i8, ptr addrspace(3) %gep2, align 2
%otherele3 = load i8, ptr addrspace(3) %gep3, align 1
; Write the phi values back to the same four addresses, element by element.
; NOTE(review): %gep1 and %gep3 are at odd byte offsets from %gep0, which the
; loads above treat as 8-byte aligned, yet these stores claim align 2 - confirm
; whether that over-stated alignment is intentional for this test.
store i8 %phi3, ptr addrspace(3) %gep0, align 2
store i8 %phi2, ptr addrspace(3) %gep1, align 2
store i8 %phi1, ptr addrspace(3) %gep2, align 2
store i8 %phi0, ptr addrspace(3) %gep3, align 2
; %vec13 holds the phi values in lanes 8-11; all other lanes are poison.
%vec10 = insertelement <16 x i8> poison, i8 %phi3, i64 8
%vec11 = insertelement <16 x i8> %vec10, i8 %phi2, i64 9
%vec12 = insertelement <16 x i8> %vec11, i8 %phi1, i64 10
%vec13 = insertelement <16 x i8> %vec12, i8 %phi0, i64 11
store <16 x i8> %vec13, ptr addrspace(3) %inptr1, align 2
; The exit condition depends only on the %flag argument.
%cmp = icmp eq i32 %flag, 0
br i1 %cmp, label %exit, label %do.body
exit:
; The same vector is stored to both outputs. These stores carry no explicit
; alignment; the assertions above show them printed with align 16.
store <16 x i8> %vec13, ptr %out
store <16 x i8> %vec13, ptr %out1
ret void
}
; phi_4_with_stores_outside_loop: the phi inputs are extractelements of a
; <4 x i8> argument rather than loads (note that %inptr0 is a vector here, not
; a pointer). The phi values are stored as four scalar bytes at %inptr1 + 8..11
; inside the loop, and the in-loop extracts are stored as four scalar bytes at
; %out1 + 0..3 after the loop.
; Expected output, as recorded by the assertions below: all three runs (GFX7,
; GFX8 and GFX9 prefixes) produce the same vectorized form - a <4 x i8> phi
; whose two incoming values are both the %inptr0 argument, one <4 x i8> store
; in the loop and one <4 x i8> store of %inptr0 in the exit block.
; NOTE(review): the %out parameter is not referenced anywhere in the body.
define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(<4 x i8> %inptr0, ptr addrspace(3) %inptr1, ptr %out, ptr addrspace(3) %out1, i32 %flag) {
; GFX7-LABEL: define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(
; GFX7-SAME: <4 x i8> [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr addrspace(3) [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX7-NEXT: [[ENTRY:.*]]:
; GFX7-NEXT: br label %[[DO_BODY:.*]]
; GFX7: [[DO_BODY]]:
; GFX7-NEXT: [[TMP0:%.*]] = phi <4 x i8> [ [[INPTR0]], %[[ENTRY]] ], [ [[INPTR0]], %[[DO_BODY]] ]
; GFX7-NEXT: [[GEP4:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR1]], i32 8
; GFX7-NEXT: store <4 x i8> [[TMP0]], ptr addrspace(3) [[GEP4]], align 2
; GFX7-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX7-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX7: [[EXIT]]:
; GFX7-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 0
; GFX7-NEXT: store <4 x i8> [[INPTR0]], ptr addrspace(3) [[GEP0]], align 1
; GFX7-NEXT: ret void
;
; GFX8-LABEL: define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(
; GFX8-SAME: <4 x i8> [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr addrspace(3) [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX8-NEXT: [[ENTRY:.*]]:
; GFX8-NEXT: br label %[[DO_BODY:.*]]
; GFX8: [[DO_BODY]]:
; GFX8-NEXT: [[TMP0:%.*]] = phi <4 x i8> [ [[INPTR0]], %[[ENTRY]] ], [ [[INPTR0]], %[[DO_BODY]] ]
; GFX8-NEXT: [[GEP4:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR1]], i32 8
; GFX8-NEXT: store <4 x i8> [[TMP0]], ptr addrspace(3) [[GEP4]], align 2
; GFX8-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX8-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX8: [[EXIT]]:
; GFX8-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 0
; GFX8-NEXT: store <4 x i8> [[INPTR0]], ptr addrspace(3) [[GEP0]], align 1
; GFX8-NEXT: ret void
;
; GFX9-LABEL: define protected amdgpu_kernel void @phi_4_with_stores_outside_loop(
; GFX9-SAME: <4 x i8> [[INPTR0:%.*]], ptr addrspace(3) [[INPTR1:%.*]], ptr [[OUT:%.*]], ptr addrspace(3) [[OUT1:%.*]], i32 [[FLAG:%.*]]) #[[ATTR0]] {
; GFX9-NEXT: [[ENTRY:.*]]:
; GFX9-NEXT: br label %[[DO_BODY:.*]]
; GFX9: [[DO_BODY]]:
; GFX9-NEXT: [[TMP0:%.*]] = phi <4 x i8> [ [[INPTR0]], %[[ENTRY]] ], [ [[INPTR0]], %[[DO_BODY]] ]
; GFX9-NEXT: [[GEP4:%.*]] = getelementptr i8, ptr addrspace(3) [[INPTR1]], i32 8
; GFX9-NEXT: store <4 x i8> [[TMP0]], ptr addrspace(3) [[GEP4]], align 2
; GFX9-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0
; GFX9-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]]
; GFX9: [[EXIT]]:
; GFX9-NEXT: [[GEP0:%.*]] = getelementptr i8, ptr addrspace(3) [[OUT1]], i32 0
; GFX9-NEXT: store <4 x i8> [[INPTR0]], ptr addrspace(3) [[GEP0]], align 1
; GFX9-NEXT: ret void
;
entry:
; Initial values for the loop-carried phis: the four lanes of %inptr0.
%ele0 = extractelement <4 x i8> %inptr0, i32 0
%ele1 = extractelement <4 x i8> %inptr0, i32 1
%ele2 = extractelement <4 x i8> %inptr0, i32 2
%ele3 = extractelement <4 x i8> %inptr0, i32 3
br label %do.body
do.body:
; The phis are declared in reverse element order (%phi0 carries lane 3, %phi3
; carries lane 0); the back-edge operands are the extracts below.
%phi0 = phi i8 [ %ele3, %entry ], [ %otherele3, %do.body ]
%phi1 = phi i8 [ %ele2, %entry ], [ %otherele2, %do.body ]
%phi2 = phi i8 [ %ele1, %entry ], [ %otherele1, %do.body ]
%phi3 = phi i8 [ %ele0, %entry ], [ %otherele0, %do.body ]
; Re-extract the same four lanes of the same argument inside the loop, so both
; phi inputs come from %inptr0.
%otherele0 = extractelement <4 x i8> %inptr0, i32 0
%otherele1 = extractelement <4 x i8> %inptr0, i32 1
%otherele2 = extractelement <4 x i8> %inptr0, i32 2
%otherele3 = extractelement <4 x i8> %inptr0, i32 3
; Store the phi values to four adjacent bytes at %inptr1 + 8 .. + 11.
; NOTE(review): offsets 9 and 11 cannot be 2-byte aligned if offset 8 is, yet
; every store here claims align 2 - confirm whether that is intentional.
%gep4 = getelementptr i8, ptr addrspace(3) %inptr1, i32 8
store i8 %phi3, ptr addrspace(3) %gep4, align 2
%gep5 = getelementptr i8, ptr addrspace(3) %inptr1, i32 9
store i8 %phi2, ptr addrspace(3) %gep5, align 2
%gep6 = getelementptr i8, ptr addrspace(3) %inptr1, i32 10
store i8 %phi1, ptr addrspace(3) %gep6, align 2
%gep7 = getelementptr i8, ptr addrspace(3) %inptr1, i32 11
store i8 %phi0, ptr addrspace(3) %gep7, align 2
; The exit condition depends only on the %flag argument.
%cmp = icmp eq i32 %flag, 0
br i1 %cmp, label %exit, label %do.body
exit:
; Store the in-loop extracts to four adjacent bytes at %out1 + 0 .. + 3. These
; stores carry no explicit alignment; the assertions above show the combined
; <4 x i8> store printed with align 1.
%gep0 = getelementptr i8, ptr addrspace(3) %out1, i32 0
%gep1 = getelementptr i8, ptr addrspace(3) %out1, i32 1
%gep2 = getelementptr i8, ptr addrspace(3) %out1, i32 2
%gep3 = getelementptr i8, ptr addrspace(3) %out1, i32 3
store i8 %otherele0, ptr addrspace(3) %gep0
store i8 %otherele1, ptr addrspace(3) %gep1
store i8 %otherele2, ptr addrspace(3) %gep2
store i8 %otherele3, ptr addrspace(3) %gep3
ret void
}