Files
clang-p2996/llvm/test/CodeGen/AMDGPU/select-undef.ll
Matt Arsenault b01adc6bed AMDGPU: Strengthen some bfloat tests
Fix bitcast test, which was splitting apart phis intended to force
bitcasts that survive all the way to selection.

Disable the amdgpu-codegenprepare phi splitting, which defeats the technique
of using a phi to ensure a bitcast reaches all the way to selection. Also
add a variety of bfloat tests. These probably need revisiting to avoid the
cast folding into argument loads. Also round out set of bfloat bitcast and
ABI tests.

Add codegen tests for more bf16 operations. The promotion of these works
contrary to the comment.
2023-12-20 19:33:45 +07:00

438 lines
12 KiB
LLVM

; Compile this file with llc for the amdgcn-amd-amdhsa triple on gfx900 and
; verify the generated assembly against the checks that use the GCN prefix.
; NOTE(review): -amdgpu-scalar-ir-passes=0 presumably keeps the AMDGPU scalar
; IR passes from rewriting the IR below before selection - confirm.
; RUN: llc -amdgpu-scalar-ir-passes=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
; select whose true operand is undef. The checks below expect s_waitcnt to be
; followed on the very next line by s_setpc_b64, with no v_cmp or v_cndmask
; emitted for the select.
; GCN-LABEL: {{^}}select_undef_lhs:
; GCN: s_waitcnt
; GCN-NOT: v_cmp
; GCN-NOT: v_cndmask
; GCN-NEXT: s_setpc_b64
define float @select_undef_lhs(float %val, i1 %cond) {
%sel = select i1 %cond, float undef, float %val
ret float %sel
}
; Mirror of select_undef_lhs: here the false operand of the select is undef.
; The checks again expect s_waitcnt to be followed on the very next line by
; s_setpc_b64, with no v_cmp or v_cndmask in between.
; GCN-LABEL: {{^}}select_undef_rhs:
; GCN: s_waitcnt
; GCN-NOT: v_cmp
; GCN-NOT: v_cndmask
; GCN-NEXT: s_setpc_b64
define float @select_undef_rhs(float %val, i1 %cond) {
%sel = select i1 %cond, float %val, float undef
ret float %sel
}
; select between the constant 1.0 (true arm) and undef (false arm), with the
; condition computed by an icmp eq of %c against 0. The checks expect 1.0 to
; be moved into a v register and that same register to appear as the second
; operand of the dword store to %a.
; GCN-LABEL: {{^}}select_undef_n1:
; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0
; GCN: store_dword {{[^,]+}}, [[RES]]
define void @select_undef_n1(ptr addrspace(1) %a, i32 %c) {
%cc = icmp eq i32 %c, 0
%sel = select i1 %cc, float 1.000000e+00, float undef
store float %sel, ptr addrspace(1) %a
ret void
}
; Same as select_undef_n1 with the select arms swapped: undef is the true arm
; and the constant 1.0 is the false arm. The expected output is unchanged:
; 1.0 is moved into a v register and that register is the second operand of
; the dword store to %a.
; GCN-LABEL: {{^}}select_undef_n2:
; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0
; GCN: store_dword {{[^,]+}}, [[RES]]
define void @select_undef_n2(ptr addrspace(1) %a, i32 %c) {
%cc = icmp eq i32 %c, 0
%sel = select i1 %cc, float undef, float 1.000000e+00
store float %sel, ptr addrspace(1) %a
ret void
}
; Declaration of the AMDGPU rcp intrinsic for float.
; NOTE(review): no function visible in this part of the file calls it; it may
; be a leftover - confirm against the rest of the file before removing.
declare float @llvm.amdgcn.rcp.f32(float)
; Make sure the vector undef isn't lowered into 0s.
; The kernel is a single self-looping block whose phi starts as an undef
; <6 x float> on entry and carries %add on the back edge. Between the function
; label and s_cbranch_vccnz, the NOT checks reject any v_mov_b32_e32 or
; s_mov_b32 whose source operand starts with 0 (the patterns are not anchored
; at end of line).
; The volatile load and store go through an undef addrspace(3) pointer; the
; %ptr argument is not used.
; GCN-LABEL: {{^}}undef_v6f32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v6f32(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <6 x float> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <6 x float>, ptr addrspace(3) undef
%add = fadd <6 x float> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <6 x float> %add, ptr addrspace(3) undef
ret void
}
; Integer counterpart of undef_v6f32: the loop phi starts as an undef
; <6 x i32> and accumulates with an integer add. The NOT checks reject any
; v_mov_b32_e32 or s_mov_b32 with a source operand starting with 0 before the
; s_cbranch_vccnz.
; The volatile load and store go through an undef addrspace(3) pointer; the
; %ptr argument is not used.
; GCN-LABEL: {{^}}undef_v6i32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v6i32(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <6 x i32> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <6 x i32>, ptr addrspace(3) undef
%add = add <6 x i32> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <6 x i32> %add, ptr addrspace(3) undef
ret void
}
; Make sure the vector undef isn't lowered into 0s.
; Same loop shape as the other undef_v* kernels, here with a <5 x float> phi
; that starts as undef and is accumulated with fadd. The NOT checks reject any
; v_mov_b32_e32 or s_mov_b32 with a source operand starting with 0 before the
; s_cbranch_vccnz.
; The volatile load and store go through an undef addrspace(3) pointer; the
; %ptr argument is not used.
; GCN-LABEL: {{^}}undef_v5f32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v5f32(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <5 x float> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <5 x float>, ptr addrspace(3) undef
%add = fadd <5 x float> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <5 x float> %add, ptr addrspace(3) undef
ret void
}
; Integer counterpart of undef_v5f32: the loop phi starts as an undef
; <5 x i32> and accumulates with an integer add. The NOT checks reject any
; v_mov_b32_e32 or s_mov_b32 with a source operand starting with 0 before the
; s_cbranch_vccnz.
; The volatile load and store go through an undef addrspace(3) pointer; the
; %ptr argument is not used.
; GCN-LABEL: {{^}}undef_v5i32:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v5i32(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <5 x i32> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <5 x i32>, ptr addrspace(3) undef
%add = add <5 x i32> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <5 x i32> %add, ptr addrspace(3) undef
ret void
}
; Make sure the vector undef isn't lowered into 0s.
; Loop kernel with a <3 x double> phi that starts as undef and is accumulated
; with fadd. Unlike the v6/v5 32-bit variants, the volatile load and store
; here use the %ptr argument. The NOT checks reject any v_mov_b32_e32 or
; s_mov_b32 with a source operand starting with 0 before the s_cbranch_vccnz.
; GCN-LABEL: {{^}}undef_v3f64:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v3f64(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <3 x double> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <3 x double>, ptr addrspace(3) %ptr
%add = fadd <3 x double> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <3 x double> %add, ptr addrspace(3) %ptr
ret void
}
; Integer counterpart of undef_v3f64: the loop phi starts as an undef
; <3 x i64> and accumulates with an integer add; the volatile load and store
; use %ptr. The NOT checks reject any v_mov_b32_e32 or s_mov_b32 with a source
; operand starting with 0 before the s_cbranch_vccnz.
; GCN-LABEL: {{^}}undef_v3i64:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v3i64(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <3 x i64> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <3 x i64>, ptr addrspace(3) %ptr
%add = add <3 x i64> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <3 x i64> %add, ptr addrspace(3) %ptr
ret void
}
; Make sure the vector undef isn't lowered into 0s.
; Loop kernel with a <4 x half> phi that starts as undef and is accumulated
; with fadd; the volatile load and store use %ptr. The NOT checks reject any
; v_mov_b32_e32 or s_mov_b32 with a source operand starting with 0 before the
; s_cbranch_vccnz.
; GCN-LABEL: {{^}}undef_v4f16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v4f16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <4 x half> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <4 x half>, ptr addrspace(3) %ptr
%add = fadd <4 x half> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <4 x half> %add, ptr addrspace(3) %ptr
ret void
}
; Integer counterpart of undef_v4f16: the loop phi starts as an undef
; <4 x i16> and accumulates with an integer add; the volatile load and store
; use %ptr. The NOT checks reject any v_mov_b32_e32 or s_mov_b32 with a source
; operand starting with 0 before the s_cbranch_vccnz.
; GCN-LABEL: {{^}}undef_v4i16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v4i16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <4 x i16> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <4 x i16>, ptr addrspace(3) %ptr
%add = add <4 x i16> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <4 x i16> %add, ptr addrspace(3) %ptr
ret void
}
; Make sure the vector undef isn't lowered into 0s.
; Loop kernel with a <2 x half> phi that starts as undef and is accumulated
; with fadd; the volatile load and store use %ptr. The NOT checks reject any
; v_mov_b32_e32 or s_mov_b32 with a source operand starting with 0 before the
; s_cbranch_vccnz.
; GCN-LABEL: {{^}}undef_v2f16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v2f16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <2 x half> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <2 x half>, ptr addrspace(3) %ptr
%add = fadd <2 x half> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <2 x half> %add, ptr addrspace(3) %ptr
ret void
}
; Integer counterpart of undef_v2f16: the loop phi starts as an undef
; <2 x i16> and accumulates with an integer add; the volatile load and store
; use %ptr. The NOT checks reject any v_mov_b32_e32 or s_mov_b32 with a source
; operand starting with 0 before the s_cbranch_vccnz.
; GCN-LABEL: {{^}}undef_v2i16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v2i16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <2 x i16> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <2 x i16>, ptr addrspace(3) %ptr
%add = add <2 x i16> %load, %phi
br i1 %cond, label %loop, label %ret
ret:
store volatile <2 x i16> %add, ptr addrspace(3) %ptr
ret void
}
; We were expanding undef vectors into zero vectors. Optimizations
; would then see we used no elements of the vector, and reform the
; undef vector resulting in a combiner loop.
;
; The body inserts %arg1 at index 2 of %arg, reinterprets the result as
; <3 x i64>, and stores element1 * %arg2 + element0 with a volatile store to
; an undef addrspace(1) pointer. The checks pin the exact instruction sequence
; after s_waitcnt: v_mad_u64_u32, two v_mul_lo_u32, v_add3_u32, then
; global_store_dwordx2, each on consecutive lines.
; NOTE(review): the insertelement base visible here is the %arg parameter, not
; an undef vector as the test name suggests - confirm this is intended.
; GCN-LABEL: {{^}}inf_loop_undef_vector:
; GCN: s_waitcnt
; GCN-NEXT: v_mad_u64_u32
; GCN-NEXT: v_mul_lo_u32
; GCN-NEXT: v_mul_lo_u32
; GCN-NEXT: v_add3_u32
; GCN-NEXT: global_store_dwordx2
define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) {
%i = insertelement <6 x float> %arg, float %arg1, i64 2
%i3 = bitcast <6 x float> %i to <3 x i64>
%i4 = extractelement <3 x i64> %i3, i64 0
%i5 = extractelement <3 x i64> %i3, i64 1
%i6 = mul i64 %i5, %arg2
%i7 = add i64 %i6, %i4
store volatile i64 %i7, ptr addrspace(1) undef, align 4
ret void
}
; Scalar bfloat version of the undef-phi loop kernels. The phi starts as an
; undef bfloat; the loaded value and the phi are both bitcast to i16, added as
; integers, and the sum is bitcast back to bfloat for the back edge and the
; final store. The volatile load and store go through an undef addrspace(3)
; pointer; the %ptr argument is not used.
; The NOT checks here are not anchored at end of line, so any v_mov_b32_e32 or
; s_mov_b32 source operand starting with 0 is rejected before s_cbranch_vccnz.
; GCN-LABEL: {{^}}undef_bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi bfloat [ undef, %entry ], [ %add, %loop ]
%load = load volatile bfloat, ptr addrspace(3) undef
%bc.0 = bitcast bfloat %load to i16
%bc.1 = bitcast bfloat %phi to i16
%add.i = add i16 %bc.0, %bc.1
%add = bitcast i16 %add.i to bfloat
br i1 %cond, label %loop, label %ret
ret:
store volatile bfloat %add, ptr addrspace(3) undef
ret void
}
; <2 x bfloat> undef-phi loop kernel. The loaded vector and the phi are
; bitcast to <2 x i16>, added as integers, and the sum is bitcast back to
; <2 x bfloat>. The volatile load and store go through an undef addrspace(3)
; pointer; the %ptr argument is not used.
; The NOT patterns end with {{$}}, so only a v_mov_b32_e32 or s_mov_b32 whose
; source operand is exactly 0 at end of line is rejected before
; s_cbranch_vccnz.
; GCN-LABEL: {{^}}undef_v2bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v2bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <2 x bfloat> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <2 x bfloat>, ptr addrspace(3) undef
%bc.0 = bitcast <2 x bfloat> %load to <2 x i16>
%bc.1 = bitcast <2 x bfloat> %phi to <2 x i16>
%add.i = add <2 x i16> %bc.0, %bc.1
%add = bitcast <2 x i16> %add.i to <2 x bfloat>
br i1 %cond, label %loop, label %ret
ret:
store volatile <2 x bfloat> %add, ptr addrspace(3) undef
ret void
}
; <3 x bfloat> undef-phi loop kernel. The loaded vector and the phi are
; bitcast to <3 x i16>, added as integers, and the sum is bitcast back to
; <3 x bfloat>. The volatile load and store go through an undef addrspace(3)
; pointer; the %ptr argument is not used.
; The NOT patterns end with {{$}}, so only a v_mov_b32_e32 or s_mov_b32 whose
; source operand is exactly 0 at end of line is rejected before
; s_cbranch_vccnz.
; GCN-LABEL: {{^}}undef_v3bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v3bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <3 x bfloat> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <3 x bfloat>, ptr addrspace(3) undef
%bc.0 = bitcast <3 x bfloat> %load to <3 x i16>
%bc.1 = bitcast <3 x bfloat> %phi to <3 x i16>
%add.i = add <3 x i16> %bc.0, %bc.1
%add = bitcast <3 x i16> %add.i to <3 x bfloat>
br i1 %cond, label %loop, label %ret
ret:
store volatile <3 x bfloat> %add, ptr addrspace(3) undef
ret void
}
; <4 x bfloat> undef-phi loop kernel. The loaded vector and the phi are
; bitcast to <4 x i16>, added as integers, and the sum is bitcast back to
; <4 x bfloat>. The volatile load and store go through an undef addrspace(3)
; pointer; the %ptr argument is not used.
; The NOT patterns end with {{$}}, so only a v_mov_b32_e32 or s_mov_b32 whose
; source operand is exactly 0 at end of line is rejected before
; s_cbranch_vccnz.
; GCN-LABEL: {{^}}undef_v4bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v4bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <4 x bfloat> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <4 x bfloat>, ptr addrspace(3) undef
%bc.0 = bitcast <4 x bfloat> %load to <4 x i16>
%bc.1 = bitcast <4 x bfloat> %phi to <4 x i16>
%add.i = add <4 x i16> %bc.0, %bc.1
%add = bitcast <4 x i16> %add.i to <4 x bfloat>
br i1 %cond, label %loop, label %ret
ret:
store volatile <4 x bfloat> %add, ptr addrspace(3) undef
ret void
}
; <6 x bfloat> undef-phi loop kernel. The loaded vector and the phi are
; bitcast to <6 x i16>, added as integers, and the sum is bitcast back to
; <6 x bfloat>. The volatile load and store go through an undef addrspace(3)
; pointer; the %ptr argument is not used.
; The NOT patterns end with {{$}}, so only a v_mov_b32_e32 or s_mov_b32 whose
; source operand is exactly 0 at end of line is rejected before
; s_cbranch_vccnz.
; GCN-LABEL: {{^}}undef_v6bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v6bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <6 x bfloat> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <6 x bfloat>, ptr addrspace(3) undef
%bc.0 = bitcast <6 x bfloat> %load to <6 x i16>
%bc.1 = bitcast <6 x bfloat> %phi to <6 x i16>
%add.i = add <6 x i16> %bc.0, %bc.1
%add = bitcast <6 x i16> %add.i to <6 x bfloat>
br i1 %cond, label %loop, label %ret
ret:
store volatile <6 x bfloat> %add, ptr addrspace(3) undef
ret void
}
; <8 x bfloat> undef-phi loop kernel. The loaded vector and the phi are
; bitcast to <8 x i16>, added as integers, and the sum is bitcast back to
; <8 x bfloat>. The volatile load and store go through an undef addrspace(3)
; pointer; the %ptr argument is not used.
; The NOT patterns end with {{$}}, so only a v_mov_b32_e32 or s_mov_b32 whose
; source operand is exactly 0 at end of line is rejected before
; s_cbranch_vccnz.
; GCN-LABEL: {{^}}undef_v8bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v8bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <8 x bfloat> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <8 x bfloat>, ptr addrspace(3) undef
%bc.0 = bitcast <8 x bfloat> %load to <8 x i16>
%bc.1 = bitcast <8 x bfloat> %phi to <8 x i16>
%add.i = add <8 x i16> %bc.0, %bc.1
%add = bitcast <8 x i16> %add.i to <8 x bfloat>
br i1 %cond, label %loop, label %ret
ret:
store volatile <8 x bfloat> %add, ptr addrspace(3) undef
ret void
}
; <16 x bfloat> undef-phi loop kernel. The loaded vector and the phi are
; bitcast to <16 x i16>, added as integers, and the sum is bitcast back to
; <16 x bfloat>. The volatile load and store go through an undef addrspace(3)
; pointer; the %ptr argument is not used.
; The NOT patterns end with {{$}}, so only a v_mov_b32_e32 or s_mov_b32 whose
; source operand is exactly 0 at end of line is rejected before
; s_cbranch_vccnz.
; GCN-LABEL: {{^}}undef_v16bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v16bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <16 x bfloat> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <16 x bfloat>, ptr addrspace(3) undef
%bc.0 = bitcast <16 x bfloat> %load to <16 x i16>
%bc.1 = bitcast <16 x bfloat> %phi to <16 x i16>
%add.i = add <16 x i16> %bc.0, %bc.1
%add = bitcast <16 x i16> %add.i to <16 x bfloat>
br i1 %cond, label %loop, label %ret
ret:
store volatile <16 x bfloat> %add, ptr addrspace(3) undef
ret void
}
; <32 x bfloat> undef-phi loop kernel. The loaded vector and the phi are
; bitcast to <32 x i16>, added as integers, and the sum is bitcast back to
; <32 x bfloat>. The volatile load and store go through an undef addrspace(3)
; pointer; the %ptr argument is not used.
; The NOT patterns end with {{$}}, so only a v_mov_b32_e32 or s_mov_b32 whose
; source operand is exactly 0 at end of line is rejected before
; s_cbranch_vccnz.
; GCN-LABEL: {{^}}undef_v32bf16:
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
; GCN: s_cbranch_vccnz
define amdgpu_kernel void @undef_v32bf16(ptr addrspace(3) %ptr, i1 %cond) {
entry:
br label %loop
loop:
%phi = phi <32 x bfloat> [ undef, %entry ], [ %add, %loop ]
%load = load volatile <32 x bfloat>, ptr addrspace(3) undef
%bc.0 = bitcast <32 x bfloat> %load to <32 x i16>
%bc.1 = bitcast <32 x bfloat> %phi to <32 x i16>
%add.i = add <32 x i16> %bc.0, %bc.1
%add = bitcast <32 x i16> %add.i to <32 x bfloat>
br i1 %cond, label %loop, label %ret
ret:
store volatile <32 x bfloat> %add, ptr addrspace(3) undef
ret void
}