Fix bitcast test, which was splitting apart phis intended to force bitcasts that survive all the way to selection. Disable the amdgpu-codegenprepare phi splitting, which defeats the technique of using a phi to ensure a bitcast reaches all the way to selection. Also add a variety of bfloat tests. These probably need revisiting to avoid the cast folding into argument loads. Also round out set of bfloat bitcast and ABI tests. Add codegen tests for more bf16 operations The promotion of these works contrary to the comment.
438 lines
12 KiB
LLVM
438 lines
12 KiB
LLVM
; RUN: llc -amdgpu-scalar-ir-passes=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
|
|
|
|
; GCN-LABEL: {{^}}select_undef_lhs:
|
|
; GCN: s_waitcnt
|
|
; GCN-NOT: v_cmp
|
|
; GCN-NOT: v_cndmask
|
|
; GCN-NEXT: s_setpc_b64
|
|
define float @select_undef_lhs(float %val, i1 %cond) {
|
|
%sel = select i1 %cond, float undef, float %val
|
|
ret float %sel
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}select_undef_rhs:
|
|
; GCN: s_waitcnt
|
|
; GCN-NOT: v_cmp
|
|
; GCN-NOT: v_cndmask
|
|
; GCN-NEXT: s_setpc_b64
|
|
define float @select_undef_rhs(float %val, i1 %cond) {
|
|
%sel = select i1 %cond, float %val, float undef
|
|
ret float %sel
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}select_undef_n1:
|
|
; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0
|
|
; GCN: store_dword {{[^,]+}}, [[RES]]
|
|
define void @select_undef_n1(ptr addrspace(1) %a, i32 %c) {
|
|
%cc = icmp eq i32 %c, 0
|
|
%sel = select i1 %cc, float 1.000000e+00, float undef
|
|
store float %sel, ptr addrspace(1) %a
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}select_undef_n2:
|
|
; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 1.0
|
|
; GCN: store_dword {{[^,]+}}, [[RES]]
|
|
define void @select_undef_n2(ptr addrspace(1) %a, i32 %c) {
|
|
%cc = icmp eq i32 %c, 0
|
|
%sel = select i1 %cc, float undef, float 1.000000e+00
|
|
store float %sel, ptr addrspace(1) %a
|
|
ret void
|
|
}
|
|
|
|
declare float @llvm.amdgcn.rcp.f32(float)
|
|
|
|
|
|
; Make sure the vector undef isn't lowered into 0s.
|
|
; GCN-LABEL: {{^}}undef_v6f32:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v6f32(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <6 x float> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <6 x float>, ptr addrspace(3) undef
|
|
%add = fadd <6 x float> %load, %phi
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <6 x float> %add, ptr addrspace(3) undef
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}undef_v6i32:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v6i32(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <6 x i32> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <6 x i32>, ptr addrspace(3) undef
|
|
%add = add <6 x i32> %load, %phi
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <6 x i32> %add, ptr addrspace(3) undef
|
|
ret void
|
|
}
|
|
|
|
; Make sure the vector undef isn't lowered into 0s.
|
|
; GCN-LABEL: {{^}}undef_v5f32:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v5f32(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <5 x float> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <5 x float>, ptr addrspace(3) undef
|
|
%add = fadd <5 x float> %load, %phi
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <5 x float> %add, ptr addrspace(3) undef
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}undef_v5i32:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v5i32(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <5 x i32> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <5 x i32>, ptr addrspace(3) undef
|
|
%add = add <5 x i32> %load, %phi
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <5 x i32> %add, ptr addrspace(3) undef
|
|
ret void
|
|
}
|
|
|
|
; Make sure the vector undef isn't lowered into 0s.
|
|
; GCN-LABEL: {{^}}undef_v3f64:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v3f64(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <3 x double> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <3 x double>, ptr addrspace(3) %ptr
|
|
%add = fadd <3 x double> %load, %phi
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <3 x double> %add, ptr addrspace(3) %ptr
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}undef_v3i64:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v3i64(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <3 x i64> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <3 x i64>, ptr addrspace(3) %ptr
|
|
%add = add <3 x i64> %load, %phi
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <3 x i64> %add, ptr addrspace(3) %ptr
|
|
ret void
|
|
}
|
|
|
|
; Make sure the vector undef isn't lowered into 0s.
|
|
; GCN-LABEL: {{^}}undef_v4f16:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v4f16(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <4 x half> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <4 x half>, ptr addrspace(3) %ptr
|
|
%add = fadd <4 x half> %load, %phi
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <4 x half> %add, ptr addrspace(3) %ptr
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}undef_v4i16:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v4i16(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <4 x i16> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <4 x i16>, ptr addrspace(3) %ptr
|
|
%add = add <4 x i16> %load, %phi
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <4 x i16> %add, ptr addrspace(3) %ptr
|
|
ret void
|
|
}
|
|
|
|
; Make sure the vector undef isn't lowered into 0s.
|
|
; GCN-LABEL: {{^}}undef_v2f16:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v2f16(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <2 x half> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <2 x half>, ptr addrspace(3) %ptr
|
|
%add = fadd <2 x half> %load, %phi
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <2 x half> %add, ptr addrspace(3) %ptr
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}undef_v2i16:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v2i16(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <2 x i16> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <2 x i16>, ptr addrspace(3) %ptr
|
|
%add = add <2 x i16> %load, %phi
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <2 x i16> %add, ptr addrspace(3) %ptr
|
|
ret void
|
|
}
|
|
|
|
; We were expanding undef vectors into zero vectors. Optimizations
|
|
; would then see we used no elements of the vector, and reform the
|
|
; undef vector resulting in a combiner loop.
|
|
; GCN-LABEL: {{^}}inf_loop_undef_vector:
|
|
; GCN: s_waitcnt
|
|
; GCN-NEXT: v_mad_u64_u32
|
|
; GCN-NEXT: v_mul_lo_u32
|
|
; GCN-NEXT: v_mul_lo_u32
|
|
; GCN-NEXT: v_add3_u32
|
|
; GCN-NEXT: global_store_dwordx2
|
|
define void @inf_loop_undef_vector(<6 x float> %arg, float %arg1, i64 %arg2) {
|
|
%i = insertelement <6 x float> %arg, float %arg1, i64 2
|
|
%i3 = bitcast <6 x float> %i to <3 x i64>
|
|
%i4 = extractelement <3 x i64> %i3, i64 0
|
|
%i5 = extractelement <3 x i64> %i3, i64 1
|
|
%i6 = mul i64 %i5, %arg2
|
|
%i7 = add i64 %i6, %i4
|
|
store volatile i64 %i7, ptr addrspace(1) undef, align 4
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}undef_bf16:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_bf16(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi bfloat [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile bfloat, ptr addrspace(3) undef
|
|
%bc.0 = bitcast bfloat %load to i16
|
|
%bc.1 = bitcast bfloat %phi to i16
|
|
%add.i = add i16 %bc.0, %bc.1
|
|
%add = bitcast i16 %add.i to bfloat
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile bfloat %add, ptr addrspace(3) undef
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}undef_v2bf16:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v2bf16(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <2 x bfloat> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <2 x bfloat>, ptr addrspace(3) undef
|
|
%bc.0 = bitcast <2 x bfloat> %load to <2 x i16>
|
|
%bc.1 = bitcast <2 x bfloat> %phi to <2 x i16>
|
|
%add.i = add <2 x i16> %bc.0, %bc.1
|
|
%add = bitcast <2 x i16> %add.i to <2 x bfloat>
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <2 x bfloat> %add, ptr addrspace(3) undef
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}undef_v3bf16:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v3bf16(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <3 x bfloat> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <3 x bfloat>, ptr addrspace(3) undef
|
|
%bc.0 = bitcast <3 x bfloat> %load to <3 x i16>
|
|
%bc.1 = bitcast <3 x bfloat> %phi to <3 x i16>
|
|
%add.i = add <3 x i16> %bc.0, %bc.1
|
|
%add = bitcast <3 x i16> %add.i to <3 x bfloat>
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <3 x bfloat> %add, ptr addrspace(3) undef
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}undef_v4bf16:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v4bf16(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <4 x bfloat> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <4 x bfloat>, ptr addrspace(3) undef
|
|
%bc.0 = bitcast <4 x bfloat> %load to <4 x i16>
|
|
%bc.1 = bitcast <4 x bfloat> %phi to <4 x i16>
|
|
%add.i = add <4 x i16> %bc.0, %bc.1
|
|
%add = bitcast <4 x i16> %add.i to <4 x bfloat>
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <4 x bfloat> %add, ptr addrspace(3) undef
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}undef_v6bf16:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v6bf16(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <6 x bfloat> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <6 x bfloat>, ptr addrspace(3) undef
|
|
%bc.0 = bitcast <6 x bfloat> %load to <6 x i16>
|
|
%bc.1 = bitcast <6 x bfloat> %phi to <6 x i16>
|
|
%add.i = add <6 x i16> %bc.0, %bc.1
|
|
%add = bitcast <6 x i16> %add.i to <6 x bfloat>
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <6 x bfloat> %add, ptr addrspace(3) undef
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}undef_v8bf16:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v8bf16(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <8 x bfloat> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <8 x bfloat>, ptr addrspace(3) undef
|
|
%bc.0 = bitcast <8 x bfloat> %load to <8 x i16>
|
|
%bc.1 = bitcast <8 x bfloat> %phi to <8 x i16>
|
|
%add.i = add <8 x i16> %bc.0, %bc.1
|
|
%add = bitcast <8 x i16> %add.i to <8 x bfloat>
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <8 x bfloat> %add, ptr addrspace(3) undef
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}undef_v16bf16:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v16bf16(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <16 x bfloat> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <16 x bfloat>, ptr addrspace(3) undef
|
|
%bc.0 = bitcast <16 x bfloat> %load to <16 x i16>
|
|
%bc.1 = bitcast <16 x bfloat> %phi to <16 x i16>
|
|
%add.i = add <16 x i16> %bc.0, %bc.1
|
|
%add = bitcast <16 x i16> %add.i to <16 x bfloat>
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <16 x bfloat> %add, ptr addrspace(3) undef
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}undef_v32bf16:
|
|
; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
|
|
; GCN-NOT: s_mov_b32 s{{[0-9]+}}, 0{{$}}
|
|
; GCN: s_cbranch_vccnz
|
|
define amdgpu_kernel void @undef_v32bf16(ptr addrspace(3) %ptr, i1 %cond) {
|
|
entry:
|
|
br label %loop
|
|
|
|
loop:
|
|
%phi = phi <32 x bfloat> [ undef, %entry ], [ %add, %loop ]
|
|
%load = load volatile <32 x bfloat>, ptr addrspace(3) undef
|
|
%bc.0 = bitcast <32 x bfloat> %load to <32 x i16>
|
|
%bc.1 = bitcast <32 x bfloat> %phi to <32 x i16>
|
|
%add.i = add <32 x i16> %bc.0, %bc.1
|
|
%add = bitcast <32 x i16> %add.i to <32 x bfloat>
|
|
br i1 %cond, label %loop, label %ret
|
|
|
|
ret:
|
|
store volatile <32 x bfloat> %add, ptr addrspace(3) undef
|
|
ret void
|
|
}
|
|
|