Files
clang-p2996/llvm/test/CodeGen/AMDGPU/wqm.ll
Jay Foad f2c164c815 [AMDGPU] Do not wait for vscnt on function entry and return
SIInsertWaitcnts inserts waitcnt instructions to resolve data
dependencies. The GFX10+ vscnt (VMEM store count) counter is never used
in this way. It is only used to resolve memory dependencies, and that is
handled by SIMemoryLegalizer. Hence there is no need to conservatively
wait for vscnt to be 0 on function entry and before returns.

Differential Revision: https://reviews.llvm.org/D153537
2023-07-04 12:22:38 +01:00

3362 lines
150 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-W64 %s
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10-W32 %s
; Check that WQM isn't triggered by image load/store intrinsics.
; The shader loads a texel with image.load.1d and writes the same value back
; with image.store.1d at the same coordinate, then returns it. Neither expected
; instruction sequence below contains an s_wqm instruction.
define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, i32 %c) {
; GFX9-W64-LABEL: test1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
; GFX9-W64-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
; GFX10-W32-NEXT: image_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_store v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%tex = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %tex, i32 15, i32 %c, <8 x i32> %rsrc, i32 0, i32 0)
ret <4 x float> %tex
}
; Check that WQM is triggered by code calculating inputs to image samples and is disabled as soon as possible
; The two interp.p1/interp.p2 pairs produce the x and y coordinates of the 2D
; sample. In the expected code exec is saved and switched with s_wqm before the
; v_interp instructions, then and-ed with the saved mask immediately before
; image_sample.
define amdgpu_ps <4 x float> @test2(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
; GFX9-W64-LABEL: test2:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_mov_b32 m0, s3
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
; GFX9-W64-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
; GFX9-W64-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
; GFX9-W64-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test2:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_mov_b32 m0, s3
; GFX10-W32-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
; GFX10-W32-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
; GFX10-W32-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
; GFX10-W32-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%inst23 = extractelement <2 x float> %pos, i32 0
%inst24 = extractelement <2 x float> %pos, i32 1
%inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
%inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
%inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
%inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
%tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
ret <4 x float> %tex
}
; ... but disabled for stores (and, in this simple case, not re-enabled) ...
; The sampled texel is written out with struct.buffer.store (the <4 x i32>
; resource form), using element 0 of the sample result, bitcast to i32, as the
; index. The same texel is also returned.
define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
; GFX9-W64-LABEL: test3:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test3:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%tex.1 = bitcast <4 x float> %tex to <4 x i32>
%tex.2 = extractelement <4 x i32> %tex.1, i32 0
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> %tex, <4 x i32> undef, i32 %tex.2, i32 0, i32 0, i32 0)
ret <4 x float> %tex
}
; Variant of test3 that performs the store through the ptr addrspace(8)
; resource form of the intrinsic (struct.ptr.buffer.store). The expected
; instruction sequences are the same as for test3.
define amdgpu_ps <4 x float> @test3_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %c) {
; GFX9-W64-LABEL: test3_ptr_buf:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test3_ptr_buf:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%tex.1 = bitcast <4 x float> %tex to <4 x i32>
%tex.2 = extractelement <4 x i32> %tex.1, i32 0
call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> %tex, ptr addrspace(8) undef, i32 %tex.2, i32 0, i32 0, i32 0)
ret <4 x float> %tex
}
; ... and disabled for export.
; Same interpolation and 2D sample sequence as test2, but the four components
; of the result are exported with llvm.amdgcn.exp.f32 and the function returns
; void. In the expected code exec is and-ed with the saved mask before
; image_sample, so the exp instruction follows with that mask still in place.
define amdgpu_ps void @test3x(i32 inreg, i32 inreg, i32 inreg, i32 inreg %m0, <8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <2 x float> %pos) #6 {
; GFX9-W64-LABEL: test3x:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_mov_b32 m0, s3
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
; GFX9-W64-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
; GFX9-W64-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
; GFX9-W64-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: exp mrt0 v0, v1, v2, v3 done vm
; GFX9-W64-NEXT: s_endpgm
;
; GFX10-W32-LABEL: test3x:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_mov_b32 m0, s3
; GFX10-W32-NEXT: v_interp_p1_f32_e32 v2, v0, attr0.x
; GFX10-W32-NEXT: v_interp_p1_f32_e32 v3, v0, attr0.y
; GFX10-W32-NEXT: v_interp_p2_f32_e32 v2, v1, attr0.x
; GFX10-W32-NEXT: v_interp_p2_f32_e32 v3, v1, attr0.y
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: image_sample v[0:3], v[2:3], s[4:11], s[12:15] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: exp mrt0 v0, v1, v2, v3 done vm
; GFX10-W32-NEXT: s_endpgm
main_body:
%inst23 = extractelement <2 x float> %pos, i32 0
%inst24 = extractelement <2 x float> %pos, i32 1
%inst25 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 0, i32 0, i32 %m0)
%inst26 = tail call float @llvm.amdgcn.interp.p2(float %inst25, float %inst24, i32 0, i32 0, i32 %m0)
%inst28 = tail call float @llvm.amdgcn.interp.p1(float %inst23, i32 1, i32 0, i32 %m0)
%inst29 = tail call float @llvm.amdgcn.interp.p2(float %inst28, float %inst24, i32 1, i32 0, i32 %m0)
%tex = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float %inst26, float %inst29, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%tex.0 = extractelement <4 x float> %tex, i32 0
%tex.1 = extractelement <4 x float> %tex, i32 1
%tex.2 = extractelement <4 x float> %tex, i32 2
%tex.3 = extractelement <4 x float> %tex, i32 3
call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex.0, float %tex.1, float %tex.2, float %tex.3, i1 true, i1 true)
ret void
}
; Check that WQM is re-enabled when required.
; In the IR a buffer store indexed by %c.1 comes first, followed by two
; dependent samples: the first sample's x component is the coordinate of the
; second. In the expected code the multiply and the first image_sample are
; emitted between s_wqm and the s_and that restores the saved exec mask; the
; second image_sample and the buffer store come after it.
; The %ptr and %data arguments are not used in the body.
define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) {
; GFX9-W64-LABEL: test4:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mul_lo_u32 v4, v0, v1
; GFX9-W64-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test4:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mul_lo_u32 v4, v0, v1
; GFX10-W32-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%c.1 = mul i32 %c, %d
call void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i32 0, i32 0)
%c.1.bc = bitcast i32 %c.1 to float
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
ret <4 x float> %dtex
}
; Variant of test4 that performs the store through the ptr addrspace(8)
; resource form of the intrinsic (struct.ptr.buffer.store). The expected
; instruction sequences are the same as for test4.
; The %ptr and %data arguments are not used in the body.
define amdgpu_ps <4 x float> @test4_ptr_buf(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, i32 %c, i32 %d, float %data) {
; GFX9-W64-LABEL: test4_ptr_buf:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mul_lo_u32 v4, v0, v1
; GFX9-W64-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test4_ptr_buf:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mul_lo_u32 v4, v0, v1
; GFX10-W32-NEXT: image_sample v0, v4, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%c.1 = mul i32 %c, %d
call void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float> undef, ptr addrspace(8) undef, i32 %c.1, i32 0, i32 0, i32 0)
%c.1.bc = bitcast i32 %c.1 to float
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
ret <4 x float> %dtex
}
; Check that WQM is triggered by the wqm intrinsic.
; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
; does not happen - the v_add should write the return reg directly.
; The two inputs are read with struct.buffer.load (the <4 x i32> resource form)
; and their sum is passed through llvm.amdgcn.wqm.f32. In the expected code the
; v_add_f32 writes v0 and no v_mov follows it.
define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test5:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test5:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
%src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
%out = fadd float %src0, %src1
%out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
ret float %out.0
}
; Variant of test5 that reads its inputs through the ptr addrspace(8) resource
; form of the intrinsic (struct.ptr.buffer.load). The expected instruction
; sequences are the same as for test5.
define amdgpu_ps float @test5_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test5_ptr_buf:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test5_ptr_buf:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
%src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
%out = fadd float %src0, %src1
%out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
ret float %out.0
}
; Check that the wqm intrinsic works correctly for integers.
; The float sum of the two loads is bitcast to i32, passed through
; llvm.amdgcn.wqm.i32, and bitcast back to float for the return value. The
; expected instruction sequences match those of the float version (test5).
define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test6:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test6:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
%src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
%out = fadd float %src0, %src1
%out.0 = bitcast float %out to i32
%out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
%out.2 = bitcast i32 %out.1 to float
ret float %out.2
}
; Variant of test6 that reads its inputs through the ptr addrspace(8) resource
; form of the intrinsic (struct.ptr.buffer.load). The expected instruction
; sequences are the same as for test6.
define amdgpu_ps float @test6_ptr_buf(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test6_ptr_buf:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test6_ptr_buf:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v0, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
%src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
%out = fadd float %src0, %src1
%out.0 = bitcast float %out to i32
%out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
%out.2 = bitcast i32 %out.1 to float
ret float %out.2
}
; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
; Check that WWM is triggered by the wwm intrinsic.
; The sum of two buffer loads is passed through llvm.amdgcn.wwm.f32. In the
; expected code the loads and the add sit between s_or_saveexec with -1 and the
; s_mov that restores exec; the result is copied into v0 only after exec has
; been restored.
define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_wwm1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
%src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
%out = fadd float %src0, %src1
%out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
ret float %out.0
}
; Same as above, but with an integer type.
; The two loaded floats are bitcast to i32 and added as integers; the sum goes
; through llvm.amdgcn.wwm.i32 and is bitcast back to float for the return. The
; expected add is v_add_u32 on gfx900 and v_add_nc_u32 on gfx1030.
define amdgpu_ps float @test_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_wwm2:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm2:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
%src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
%src0.0 = bitcast float %src0 to i32
%src1.0 = bitcast float %src1 to i32
%out = add i32 %src0.0, %src1.0
%out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
%out.1 = bitcast i32 %out.0 to float
ret float %out.1
}
; Check that we don't leave WWM on for computations that don't require WWM,
; since that will lead to clobbering things that aren't supposed to be clobbered
; in cases like this.
; We enforce this by checking that v_add gets emitted in the same block as
; WWM computations.
; In the expected %if block the first v_add_f32 (the wwm operand) sits between
; s_or_saveexec and the s_mov that restores exec; the second v_add_f32, which
; consumes the wwm result, comes after exec has been restored.
define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
; GFX9-W64-LABEL: test_wwm3:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB13_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
; GFX9-W64-NEXT: .LBB13_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm3:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB13_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
; GFX10-W32-NEXT: .LBB13_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; use mbcnt to make sure the branch is divergent
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
%cc = icmp uge i32 %hi, 16
br i1 %cc, label %endif, label %if
if:
%src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
%out = fadd float %src, %src
%out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
%out.1 = fadd float %src, %out.0
br label %endif
endif:
%out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
ret float %out.2
}
; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
; write could clobber disabled channels in the non-WWM one.
; We enforce this by checking that v_mov gets emitted in the same block as
; WWM computations.
; In the expected %if block the wwm result is computed into v1 while all lanes
; are enabled, and is copied into v0 by a separate v_mov only after exec has
; been restored.
define amdgpu_ps float @test_wwm4(i32 inreg %idx) {
; GFX9-W64-LABEL: test_wwm4:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB14_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: .LBB14_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm4:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB14_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: .LBB14_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; use mbcnt to make sure the branch is divergent
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
%cc = icmp uge i32 %hi, 16
br i1 %cc, label %endif, label %if
if:
%src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
%out = fadd float %src, %src
%out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
br label %endif
endif:
%out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
ret float %out.1
}
; Make sure the transition from Exact to WWM then WQM works properly.
; IR order: a buffer load/store pair on %idx0, then a load on %idx1 whose
; doubled value goes through llvm.amdgcn.wwm.f32, then a second fadd on that
; result which goes through llvm.amdgcn.wqm.f32.
; Expected code: the buffer store is followed by an s_or_saveexec/-1 region
; holding the second load and first add; after exec is restored, s_wqm is
; applied for the final add, and exec is and-ed with the originally saved mask
; before returning.
define amdgpu_ps float @test_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_wwm5:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm5:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
%src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
%temp = fadd float %src1, %src1
%temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
%out = fadd float %temp.0, %temp.0
%out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
ret float %out.0
}
; Check that WWM is turned on correctly across basic block boundaries.
; if..then..endif version
; NOTE(review): the four SI-CHECK / VI-CHECK lines below use prefixes that are
; not enabled by either llc invocation visible at the top of this file, so
; FileCheck presumably ignores them. They also name buffer/flat loads, whereas
; the generated assertions expect global_load_dword. Confirm whether they can
; be dropped.
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
; %src0 is loaded in %main_body and %src1 in %if; their sum, computed in %if,
; is the wwm operand. In the expected code each of the two loads is wrapped in
; its own s_or_saveexec/-1 region, one in each block.
define amdgpu_ps float @test_wwm6_then() {
; GFX9-W64-LABEL: test_wwm6_then:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB16_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: .LBB16_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm6_then:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB16_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: .LBB16_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%src0 = load volatile float, ptr addrspace(1) undef
; use mbcnt to make sure the branch is divergent
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
%cc = icmp uge i32 %hi, 16
br i1 %cc, label %endif, label %if
if:
%src1 = load volatile float, ptr addrspace(1) undef
%out = fadd float %src0, %src1
%out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
br label %endif
endif:
%out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
ret float %out.1
}
; Check that WWM is turned on correctly across basic block boundaries.
; loop version
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
define amdgpu_ps float @test_wwm6_loop() {
; GFX9-W64-LABEL: test_wwm6_loop:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0
; GFX9-W64-NEXT: .LBB17_1: ; %loop
; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_cbranch_execnz .LBB17_1
; GFX9-W64-NEXT: ; %bb.2: ; %endloop
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm6_loop:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: s_mov_b32 s0, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
; GFX10-W32-NEXT: .LBB17_1: ; %loop
; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3
; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_cbranch_execnz .LBB17_1
; GFX10-W32-NEXT: ; %bb.2: ; %endloop
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; %src0 is loaded once before the loop and consumed by the fadd inside it.
%src0 = load volatile float, ptr addrspace(1) undef
; use mbcnt to make sure the branch is divergent
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
br label %loop
loop:
; The counter starts at %hi, is decremented once per iteration, and the loop
; is left as soon as the decremented value equals zero.
%counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
; A fresh volatile load on every iteration feeds the fadd.
%src1 = load volatile float, ptr addrspace(1) undef
%out = fadd float %src0, %src1
; The fadd result is passed through the wwm intrinsic.
%out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
%counter.1 = sub i32 %counter, 1
%cc = icmp ne i32 %counter.1, 0
br i1 %cc, label %loop, label %endloop
endloop:
; The value returned is the wwm result of the final iteration.
ret float %out.0
}
; Check that @llvm.amdgcn.set.inactive disables WWM.
define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
; GFX9-W64-LABEL: test_wwm_set_inactive1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_endpgm
;
; GFX10-W32-LABEL: test_wwm_set_inactive1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_endpgm
main_body:
; The load feeds set.inactive, so it sits upstream of the WWM region rather
; than inside it.
%src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
%src.0 = bitcast float %src to i32
; set.inactive takes the loaded value and the constant 0 as its second
; operand; the expected code writes that 0 with exec inverted.
%src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
; Only the add is passed through the wwm intrinsic.
%out = add i32 %src.1, %src.1
%out.0 = call i32 @llvm.amdgcn.wwm.i32(i32 %out)
%out.1 = bitcast i32 %out.0 to float
; The result is stored back using the same index as the load.
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
ret void
}
; Check that Strict WQM is triggered by the strict_wqm intrinsic.
define amdgpu_ps float @test_strict_wqm1(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_strict_wqm1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; Both loads are used only by the fadd whose result goes to strict.wqm.
%src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
%src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
%out = fadd float %src0, %src1
; The strict.wqm intrinsic is the only mode request in this function.
%out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
ret float %out.0
}
; Same as above, but with an integer type.
define amdgpu_ps float @test_strict_wqm2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_strict_wqm2:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm2:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; Two buffer loads produce float values that are reinterpreted as i32 below.
%src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
%src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
%src0.0 = bitcast float %src0 to i32
%src1.0 = bitcast float %src1 to i32
; Integer add whose result goes through the i32 flavour of strict.wqm.
%out = add i32 %src0.0, %src1.0
%out.0 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %out)
; Cast back so the shader still returns a float.
%out.1 = bitcast i32 %out.0 to float
ret float %out.1
}
; Check that we don't leave Strict WQM on for computations that don't require it,
; since that will lead to clobbering things that aren't supposed to be clobbered
; in cases like this.
; We enforce this by checking that v_add gets emitted in the same block as
; Strict WQM computations.
define amdgpu_ps float @test_strict_wqm3(i32 inreg %idx) {
; GFX9-W64-LABEL: test_strict_wqm3:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB21_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
; GFX9-W64-NEXT: .LBB21_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm3:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB21_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
; GFX10-W32-NEXT: .LBB21_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; use mbcnt to make sure the branch is divergent
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
; Lanes with %hi >= 16 skip straight to %endif; the remaining lanes run %if.
%cc = icmp uge i32 %hi, 16
br i1 %cc, label %endif, label %if
if:
%src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
; First fadd: its result is the operand of strict.wqm.
%out = fadd float %src, %src
%out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
; Second fadd: consumes the strict.wqm result but is not itself passed to the
; intrinsic; the expected code places it after exec has been restored.
%out.1 = fadd float %src, %out.0
br label %endif
endif:
; Lanes that skipped %if return the constant 0.0.
%out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
ret float %out.2
}
; Check that Strict WQM writes aren't coalesced with non-strict writes, since
; the Strict WQM write could clobber disabled channels in the non-strict one.
; We enforce this by checking that v_mov gets emitted in the same block as
; Strict WQM computations.
define amdgpu_ps float @test_strict_wqm4(i32 inreg %idx) {
; GFX9-W64-LABEL: test_strict_wqm4:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB22_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: .LBB22_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm4:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB22_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: .LBB22_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; use mbcnt to make sure the branch is divergent
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
; Lanes with %hi >= 16 skip straight to %endif; the remaining lanes run %if.
%cc = icmp uge i32 %hi, 16
br i1 %cc, label %endif, label %if
if:
%src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
%out = fadd float %src, %src
; The strict.wqm result flows directly into the phi in %endif with no further
; arithmetic in between.
%out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
br label %endif
endif:
; Merges the strict.wqm result with the constant 0.0 from the skipping path.
%out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
ret float %out.1
}
; Make sure the transition from Exact to Strict WQM then WQM works properly.
define amdgpu_ps float @test_strict_wqm5(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_strict_wqm5:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 s[4:5], exec
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm5:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; First phase: a load/store pair on %idx0 that is not connected to either
; wqm intrinsic below.
%src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
; Second phase: load from %idx1 and fadd, with the result passed to strict.wqm.
%src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
%temp = fadd float %src1, %src1
%temp.0 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
; Third phase: a further fadd on the strict.wqm result, passed to the plain
; (non-strict) wqm intrinsic.
%out = fadd float %temp.0, %temp.0
%out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
ret float %out.0
}
; Check that Strict WQM is turned on correctly across basic block boundaries.
; if..then..endif version
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
define amdgpu_ps float @test_strict_wqm6_then() {
; GFX9-W64-LABEL: test_strict_wqm6_then:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB24_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: .LBB24_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm6_then:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB24_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: .LBB24_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; %src0 is loaded in the entry block but only consumed by the fadd in %if,
; so the operand of strict.wqm is produced across a block boundary.
%src0 = load volatile float, ptr addrspace(1) undef
; use mbcnt to make sure the branch is divergent
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
; Lanes with %hi >= 16 skip straight to %endif; the remaining lanes run %if.
%cc = icmp uge i32 %hi, 16
br i1 %cc, label %endif, label %if
if:
; Second operand of the fadd is loaded inside the conditional block.
%src1 = load volatile float, ptr addrspace(1) undef
%out = fadd float %src0, %src1
; The fadd result is passed through the strict.wqm intrinsic.
%out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
br label %endif
endif:
; Lanes that skipped %if return the constant 0.0.
%out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
ret float %out.1
}
; Check that Strict WQM is turned on correctly across basic block boundaries.
; loop version
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
define amdgpu_ps float @test_strict_wqm6_loop() {
; GFX9-W64-LABEL: test_strict_wqm6_loop:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0
; GFX9-W64-NEXT: .LBB25_1: ; %loop
; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_cbranch_execnz .LBB25_1
; GFX9-W64-NEXT: ; %bb.2: ; %endloop
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm6_loop:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: s_mov_b32 s0, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
; GFX10-W32-NEXT: .LBB25_1: ; %loop
; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3
; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_cbranch_execnz .LBB25_1
; GFX10-W32-NEXT: ; %bb.2: ; %endloop
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; %src0 is loaded once before the loop and consumed by the fadd inside it.
%src0 = load volatile float, ptr addrspace(1) undef
; use mbcnt to make sure the branch is divergent
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
br label %loop
loop:
; The counter starts at %hi, is decremented once per iteration, and the loop
; is left as soon as the decremented value equals zero.
%counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
; A fresh volatile load on every iteration feeds the fadd.
%src1 = load volatile float, ptr addrspace(1) undef
%out = fadd float %src0, %src1
; The fadd result is passed through the strict.wqm intrinsic.
%out.0 = call float @llvm.amdgcn.strict.wqm.f32(float %out)
%counter.1 = sub i32 %counter, 1
%cc = icmp ne i32 %counter.1, 0
br i1 %cc, label %loop, label %endloop
endloop:
; The value returned is the strict.wqm result of the final iteration.
ret float %out.0
}
; Check that enabling WQM anywhere enables WQM for the set.inactive source.
define amdgpu_ps void @test_set_inactive2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_set_inactive2:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s0
; GFX9-W64-NEXT: buffer_load_dword v1, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $scc killed $exec
; GFX9-W64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_u32_e32 v1, v2, v1
; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_endpgm
;
; GFX10-W32-LABEL: test_set_inactive2:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $scc killed $exec
; GFX10-W32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_endpgm
main_body:
; First chain: load from %idx1 feeding set.inactive. The inactive-lane value
; is undef, and there is no wwm/strict intrinsic anywhere in this function.
%src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
%src1.0 = bitcast float %src1 to i32
%src1.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src1.0, i32 undef)
; Second chain: load from %idx0 passed through the wqm intrinsic. This is the
; only explicit WQM request in the function.
%src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
%src0.0 = bitcast float %src0 to i32
%src0.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %src0.0)
; Combine both chains and store the sum using %idx1.
%out = add i32 %src0.1, %src1.1
%out.0 = bitcast i32 %out to float
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.0, ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
ret void
}
; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.
; Note: In this particular case, the save-and-restore could be avoided if the
; analysis understood that the two branches of the if-else are mutually
; exclusive.
define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
; GFX9-W64-LABEL: test_control_flow_0:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
; GFX9-W64-NEXT: s_cbranch_execz .LBB27_2
; GFX9-W64-NEXT: ; %bb.1: ; %ELSE
; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: ; implicit-def: $vgpr0
; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-W64-NEXT: .LBB27_2: ; %Flow
; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15]
; GFX9-W64-NEXT: s_cbranch_execz .LBB27_4
; GFX9-W64-NEXT: ; %bb.3: ; %IF
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: .LBB27_4: ; %END
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_control_flow_0:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
; GFX10-W32-NEXT: s_cbranch_execz .LBB27_2
; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: ; implicit-def: $vgpr0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
; GFX10-W32-NEXT: .LBB27_2: ; %Flow
; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13
; GFX10-W32-NEXT: s_cbranch_execz .LBB27_4
; GFX10-W32-NEXT: ; %bb.3: ; %IF
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: .LBB27_4: ; %END
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; Branch on the per-lane input %z. Lanes with %z == 0 take %IF, all others
; take %ELSE.
%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %IF, label %ELSE
IF:
; Sampling side: two dependent image samples, where the second uses the first
; component of the first sample as its coordinate.
%c.bc = bitcast i32 %c to float
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%data.if = extractelement <4 x float> %dtex, i32 0
br label %END
ELSE:
; Store side: a single buffer store of %data; the expected code narrows exec
; to the saved entry mask around it.
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0)
br label %END
END:
; Return the sampled value from %IF, or the unmodified %data from %ELSE.
%r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
ret float %r
}
; Reverse branch order compared to the previous test.
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
; GFX9-W64-LABEL: test_control_flow_1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
; GFX9-W64-NEXT: s_cbranch_execz .LBB28_2
; GFX9-W64-NEXT: ; %bb.1: ; %IF
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: ; implicit-def: $vgpr0
; GFX9-W64-NEXT: .LBB28_2: ; %Flow
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: s_and_b64 s[0:1], exec, s[0:1]
; GFX9-W64-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_cbranch_execz .LBB28_4
; GFX9-W64-NEXT: ; %bb.3: ; %ELSE
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: .LBB28_4: ; %END
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_control_flow_1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT: v_cmpx_ne_u32_e32 0, v1
; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
; GFX10-W32-NEXT: s_cbranch_execz .LBB28_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v2, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: ; implicit-def: $vgpr0
; GFX10-W32-NEXT: .LBB28_2: ; %Flow
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: s_and_b32 s0, exec_lo, s0
; GFX10-W32-NEXT: s_xor_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_cbranch_execz .LBB28_4
; GFX10-W32-NEXT: ; %bb.3: ; %ELSE
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: .LBB28_4: ; %END
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; Branch on the per-lane input %z with the successors swapped. Lanes with
; %z == 0 take %ELSE (store side), all others take %IF (sampling side).
%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %ELSE, label %IF
IF:
; Sampling side: two dependent image samples, where the second uses the first
; component of the first sample as its coordinate.
%c.bc = bitcast i32 %c to float
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%data.if = extractelement <4 x float> %dtex, i32 0
br label %END
ELSE:
; Store side: a single buffer store of %data; the expected code masks exec
; with the saved entry mask before reaching it.
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 %c, i32 0, i32 0, i32 0)
br label %END
END:
; Return the sampled value from %IF, or the unmodified %data from %ELSE.
%r = phi float [ %data.if, %IF ], [ %data, %ELSE ]
ret float %r
}
; Check that branch conditions are properly marked as needing WQM...
;
; The branch on %cc picks %coord.IF or %coord.ELSE, and the phi of the two is
; the coordinate of the image sample in END. %cc itself depends on %z, a buffer
; load placed between two buffer stores. The expected code alternates: exact
; mode (s_and with the saved exec mask) for each store, WQM (s_wqm) for the
; load, the compare and both branch arms, then exact mode again just before
; the sample is issued.
define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <3 x i32> %idx, <2 x float> %data, i32 %coord) {
; GFX9-W64-LABEL: test_control_flow_2:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
; GFX9-W64-NEXT: ; implicit-def: $vgpr0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_xor_b64 s[14:15], exec, s[14:15]
; GFX9-W64-NEXT: ; %bb.1: ; %ELSE
; GFX9-W64-NEXT: v_lshlrev_b32_e32 v0, 2, v5
; GFX9-W64-NEXT: ; implicit-def: $vgpr5
; GFX9-W64-NEXT: ; %bb.2: ; %Flow
; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[14:15], s[14:15]
; GFX9-W64-NEXT: ; %bb.3: ; %IF
; GFX9-W64-NEXT: v_mul_lo_u32 v0, v5, 3
; GFX9-W64-NEXT: ; %bb.4: ; %END
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_control_flow_2:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: buffer_store_dword v3, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0, v0
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: buffer_store_dword v4, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: ; implicit-def: $vgpr0
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT: s_xor_b32 s13, exec_lo, s13
; GFX10-W32-NEXT: ; %bb.1: ; %ELSE
; GFX10-W32-NEXT: v_lshlrev_b32_e32 v0, 2, v5
; GFX10-W32-NEXT: ; implicit-def: $vgpr5
; GFX10-W32-NEXT: ; %bb.2: ; %Flow
; GFX10-W32-NEXT: s_andn2_saveexec_b32 s13, s13
; GFX10-W32-NEXT: ; %bb.3: ; %IF
; GFX10-W32-NEXT: v_mul_lo_u32 v0, v5, 3
; GFX10-W32-NEXT: ; %bb.4: ; %END
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; First store: uses element 0 of %idx and element 0 of %data.
%idx.1 = extractelement <3 x i32> %idx, i32 0
%data.1 = extractelement <2 x float> %data, i32 0
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0)
; The load that determines the branch (and should therefore be WQM) is
; surrounded by stores that require disabled WQM.
%idx.2 = extractelement <3 x i32> %idx, i32 1
%z = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx.2, i32 0, i32 0, i32 0)
; Second store: uses element 2 of %idx and element 1 of %data.
%idx.3 = extractelement <3 x i32> %idx, i32 2
%data.3 = extractelement <2 x float> %data, i32 1
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.3, ptr addrspace(8) undef, i32 %idx.3, i32 0, i32 0, i32 0)
; The branch condition is derived from the loaded value %z.
%cc = fcmp ogt float %z, 0.0
br i1 %cc, label %IF, label %ELSE
IF:
%coord.IF = mul i32 %coord, 3
br label %END
ELSE:
%coord.ELSE = mul i32 %coord, 4
br label %END
END:
; The phi makes the sample coordinate depend on which arm was taken.
%coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
%coord.END.bc = bitcast i32 %coord.END to float
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
ret <4 x float> %tex
}
; ... but only if they really do need it.
;
; Here the branch only selects which fmul of %dtex.1 is returned; no image
; sample consumes anything computed after the branch. The expected code leaves
; WQM right after the first sample (s_and with the saved exec mask) and does
; not re-enter it for the compare or the branch.
define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
; GFX9-W64-LABEL: test_control_flow_3:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v1
; GFX9-W64-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: ; implicit-def: $vgpr0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-W64-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX9-W64-NEXT: s_cbranch_execnz .LBB30_3
; GFX9-W64-NEXT: ; %bb.1: ; %Flow
; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX9-W64-NEXT: s_cbranch_execnz .LBB30_4
; GFX9-W64-NEXT: .LBB30_2: ; %END
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_branch .LBB30_5
; GFX9-W64-NEXT: .LBB30_3: ; %ELSE
; GFX9-W64-NEXT: v_mul_f32_e32 v0, 4.0, v1
; GFX9-W64-NEXT: ; implicit-def: $vgpr1
; GFX9-W64-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX9-W64-NEXT: s_cbranch_execz .LBB30_2
; GFX9-W64-NEXT: .LBB30_4: ; %IF
; GFX9-W64-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_branch .LBB30_5
; GFX9-W64-NEXT: .LBB30_5:
;
; GFX10-W32-LABEL: test_control_flow_3:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v1, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dword v1, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: ; implicit-def: $vgpr0
; GFX10-W32-NEXT: v_cmpx_nlt_f32_e32 0, v1
; GFX10-W32-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX10-W32-NEXT: s_cbranch_execnz .LBB30_3
; GFX10-W32-NEXT: ; %bb.1: ; %Flow
; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0
; GFX10-W32-NEXT: s_cbranch_execnz .LBB30_4
; GFX10-W32-NEXT: .LBB30_2: ; %END
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_branch .LBB30_5
; GFX10-W32-NEXT: .LBB30_3: ; %ELSE
; GFX10-W32-NEXT: v_mul_f32_e32 v0, 4.0, v1
; GFX10-W32-NEXT: ; implicit-def: $vgpr1
; GFX10-W32-NEXT: s_andn2_saveexec_b32 s0, s0
; GFX10-W32-NEXT: s_cbranch_execz .LBB30_2
; GFX10-W32-NEXT: .LBB30_4: ; %IF
; GFX10-W32-NEXT: v_mul_f32_e32 v0, 0x40400000, v1
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_branch .LBB30_5
; GFX10-W32-NEXT: .LBB30_5:
main_body:
; Dependent pair of samples: element 0 of %tex is the coordinate of %dtex.
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%dtex.1 = extractelement <4 x float> %dtex, i32 0
; %dtex.1 is both stored and used as the branch condition input.
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %dtex.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
%cc = fcmp ogt float %dtex.1, 0.0
br i1 %cc, label %IF, label %ELSE
IF:
%tex.IF = fmul float %dtex.1, 3.0
br label %END
ELSE:
%tex.ELSE = fmul float %dtex.1, 4.0
br label %END
END:
; The merged value is only returned; it does not feed any sample.
%tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ]
ret float %tex.END
}
; Another test that failed at some point because of terminator handling.
;
; The IF block holds only a buffer load and a buffer store. The expected code
; runs them under s_and_saveexec with the saved exact mask and restores exec
; before END. In END the first sample is issued before the final switch to
; exact mode, and the dependent second sample after it.
define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
; GFX9-W64-LABEL: test_control_flow_4:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB31_2
; GFX9-W64-NEXT: ; %bb.1: ; %IF
; GFX9-W64-NEXT: s_and_saveexec_b64 s[16:17], s[12:13]
; GFX9-W64-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, 1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 exec, s[16:17]
; GFX9-W64-NEXT: .LBB31_2: ; %END
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_control_flow_4:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
; GFX10-W32-NEXT: s_cbranch_execz .LBB31_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: s_and_saveexec_b32 s14, s12
; GFX10-W32-NEXT: buffer_load_dword v1, off, s[0:3], 0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, 1
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dword v1, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s14
; GFX10-W32-NEXT: .LBB31_2: ; %END
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%cond = icmp eq i32 %y, 0
br i1 %cond, label %IF, label %END
IF:
; Load a value and store it straight back out; nothing here feeds a sample.
%data = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0)
br label %END
END:
; Dependent pair of samples: element 0 of %tex is the coordinate of %dtex.
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
ret <4 x float> %dtex
}
; Kill is performed in WQM mode so that uniform kill behaves correctly ...
;
; %tex2 is sampled after the kill and its first element is the coordinate of a
; further sample (%dtex). The expected code re-enters WQM (s_wqm) before the
; compare that feeds the kill, then removes the killed lanes from both the
; saved live mask and exec. If the updated live mask is empty it branches to a
; block that clears exec, performs a null export and ends the program.
define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, ptr addrspace(1) inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
; GFX9-W64-LABEL: test_kill_0:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v6
; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], vcc
; GFX9-W64-NEXT: s_cbranch_scc0 .LBB32_2
; GFX9-W64-NEXT: ; %bb.1: ; %main_body
; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc
; GFX9-W64-NEXT: image_sample v0, v5, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v7, v11
; GFX9-W64-NEXT: buffer_store_dword v3, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: v_add_f32_e32 v1, v8, v12
; GFX9-W64-NEXT: v_add_f32_e32 v2, v9, v13
; GFX9-W64-NEXT: v_add_f32_e32 v3, v10, v14
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_branch .LBB32_3
; GFX9-W64-NEXT: .LBB32_2:
; GFX9-W64-NEXT: s_mov_b64 exec, 0
; GFX9-W64-NEXT: exp null off, off, off, off done vm
; GFX9-W64-NEXT: s_endpgm
; GFX9-W64-NEXT: .LBB32_3:
;
; GFX10-W32-LABEL: test_kill_0:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: image_sample v[7:10], v4, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v6
; GFX10-W32-NEXT: s_andn2_b32 s12, s12, vcc_lo
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB32_2
; GFX10-W32-NEXT: ; %bb.1: ; %main_body
; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
; GFX10-W32-NEXT: image_sample v0, v5, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v[11:14], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: buffer_store_dword v3, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v4, v8, v12
; GFX10-W32-NEXT: v_add_f32_e32 v5, v10, v14
; GFX10-W32-NEXT: v_add_f32_e32 v0, v7, v11
; GFX10-W32-NEXT: v_add_f32_e32 v2, v9, v13
; GFX10-W32-NEXT: v_mov_b32_e32 v1, v4
; GFX10-W32-NEXT: v_mov_b32_e32 v3, v5
; GFX10-W32-NEXT: s_branch .LBB32_3
; GFX10-W32-NEXT: .LBB32_2:
; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
; GFX10-W32-NEXT: exp null off, off, off, off done vm
; GFX10-W32-NEXT: s_endpgm
; GFX10-W32-NEXT: .LBB32_3:
main_body:
; Sample and store that precede the kill.
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%idx.0 = extractelement <2 x i32> %idx, i32 0
%data.0 = extractelement <2 x float> %data, i32 0
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.0, ptr addrspace(8) undef, i32 %idx.0, i32 0, i32 0, i32 0)
; The kill condition depends only on the shader input %z.
%z.cmp = fcmp olt float %z, 0.0
call void @llvm.amdgcn.kill(i1 %z.cmp)
; Store and dependent pair of samples that follow the kill.
%idx.1 = extractelement <2 x i32> %idx, i32 1
%data.1 = extractelement <2 x float> %data, i32 1
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data.1, ptr addrspace(8) undef, i32 %idx.1, i32 0, i32 0, i32 0)
%tex2 = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%tex2.0 = extractelement <4 x float> %tex2, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex2.0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%out = fadd <4 x float> %tex, %dtex
ret <4 x float> %out
}
; ... but only if WQM is necessary.
;
; Both samples precede the kill and nothing after the kill feeds a sample. The
; expected code leaves WQM after the first sample (s_and with the saved exec
; mask) and performs the kill without any further s_wqm.
define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
; GFX9-W64-LABEL: test_kill_1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: v_mov_b32_e32 v4, v2
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v5, v0
; GFX9-W64-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
; GFX9-W64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v4
; GFX9-W64-NEXT: s_andn2_b64 s[12:13], s[12:13], vcc
; GFX9-W64-NEXT: buffer_store_dword v5, off, s[0:3], 0
; GFX9-W64-NEXT: s_cbranch_scc0 .LBB33_2
; GFX9-W64-NEXT: ; %bb.1: ; %main_body
; GFX9-W64-NEXT: s_andn2_b64 exec, exec, vcc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_branch .LBB33_3
; GFX9-W64-NEXT: .LBB33_2:
; GFX9-W64-NEXT: s_mov_b64 exec, 0
; GFX9-W64-NEXT: exp null off, off, off, off done vm
; GFX9-W64-NEXT: s_endpgm
; GFX9-W64-NEXT: .LBB33_3:
;
; GFX10-W32-LABEL: test_kill_1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v2
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v5, v0
; GFX10-W32-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v4
; GFX10-W32-NEXT: buffer_store_dword v5, off, s[0:3], 0
; GFX10-W32-NEXT: s_andn2_b32 s12, s12, vcc_lo
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB33_2
; GFX10-W32-NEXT: ; %bb.1: ; %main_body
; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_branch .LBB33_3
; GFX10-W32-NEXT: .LBB33_2:
; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
; GFX10-W32-NEXT: exp null off, off, off, off done vm
; GFX10-W32-NEXT: s_endpgm
; GFX10-W32-NEXT: .LBB33_3:
main_body:
; Dependent pair of samples: element 0 of %tex is the coordinate of %dtex.
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
; The kill is the last operation before the return; no sample follows it.
%z.cmp = fcmp olt float %z, 0.0
call void @llvm.amdgcn.kill(i1 %z.cmp)
ret <4 x float> %dtex
}
; Check prolog shaders.
;
; The function body is a single fadd with no image or buffer operations, yet
; the expected code wraps the add in s_wqm and restores the saved exec mask
; afterwards.
; NOTE(review): attribute group #5 is defined outside this part of the file;
; presumably it is what requests WQM for this function -- confirm against the
; attribute definitions at the end of the file.
define amdgpu_ps float @test_prolog_1(float %a, float %b) #5 {
; GFX9-W64-LABEL: test_prolog_1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_prolog_1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%s = fadd float %a, %b
ret float %s
}
; Loop whose exit test is a float compare; the expected code branches on vcc
; (s_cbranch_vccz). The image store in the entry block runs in exact mode
; (s_and with the saved exec mask), after which the code switches to WQM
; (s_wqm) for the loop. Each iteration feeds element 0 of the previous sample
; result back in as the next sample coordinate. The break block ANDs exec with
; the saved mask again before returning.
define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
; GFX9-W64-LABEL: test_loop_vcc:
; GFX9-W64: ; %bb.0: ; %entry
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v7, v3
; GFX9-W64-NEXT: v_mov_b32_e32 v6, v2
; GFX9-W64-NEXT: v_mov_b32_e32 v5, v1
; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: image_store v[4:7], v0, s[0:7] dmask:0xf unorm
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v8, 0
; GFX9-W64-NEXT: s_mov_b32 s4, 0x40e00000
; GFX9-W64-NEXT: s_branch .LBB35_2
; GFX9-W64-NEXT: .LBB35_1: ; %body
; GFX9-W64-NEXT: ; in Loop: Header=BB35_2 Depth=1
; GFX9-W64-NEXT: image_sample v[4:7], v0, s[0:7], s[0:3] dmask:0xf
; GFX9-W64-NEXT: v_add_f32_e32 v8, 2.0, v8
; GFX9-W64-NEXT: s_cbranch_execz .LBB35_4
; GFX9-W64-NEXT: .LBB35_2: ; %loop
; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v4
; GFX9-W64-NEXT: v_cmp_lt_f32_e32 vcc, s4, v8
; GFX9-W64-NEXT: v_mov_b32_e32 v1, v5
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v6
; GFX9-W64-NEXT: v_mov_b32_e32 v3, v7
; GFX9-W64-NEXT: s_cbranch_vccz .LBB35_1
; GFX9-W64-NEXT: ; %bb.3:
; GFX9-W64-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7
; GFX9-W64-NEXT: ; implicit-def: $vgpr8
; GFX9-W64-NEXT: .LBB35_4: ; %break
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_loop_vcc:
; GFX10-W32: ; %bb.0: ; %entry
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v8, 0
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: image_store v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_branch .LBB35_2
; GFX10-W32-NEXT: .p2align 6
; GFX10-W32-NEXT: .LBB35_1: ; %body
; GFX10-W32-NEXT: ; in Loop: Header=BB35_2 Depth=1
; GFX10-W32-NEXT: image_sample v[0:3], v4, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: v_add_f32_e32 v8, 2.0, v8
; GFX10-W32-NEXT: s_cbranch_execz .LBB35_4
; GFX10-W32-NEXT: .LBB35_2: ; %loop
; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-W32-NEXT: v_cmp_lt_f32_e32 vcc_lo, 0x40e00000, v8
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v7, v3
; GFX10-W32-NEXT: v_mov_b32_e32 v6, v2
; GFX10-W32-NEXT: v_mov_b32_e32 v5, v1
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
; GFX10-W32-NEXT: s_cbranch_vccz .LBB35_1
; GFX10-W32-NEXT: ; %bb.3:
; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX10-W32-NEXT: ; implicit-def: $vgpr8
; GFX10-W32-NEXT: .LBB35_4: ; %break
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v4
; GFX10-W32-NEXT: v_mov_b32_e32 v1, v5
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v6
; GFX10-W32-NEXT: v_mov_b32_e32 v3, v7
; GFX10-W32-NEXT: ; return to shader part epilog
entry:
; Image store of the incoming value before the loop is entered.
call void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float> %in, i32 15, i32 undef, <8 x i32> undef, i32 0, i32 0)
br label %loop
loop:
; %ctr.iv counts up from 0.0 in steps of 2.0; the loop exits once it
; exceeds 7.0. %c.iv carries the most recent sample result (initially %in).
%ctr.iv = phi float [ 0.0, %entry ], [ %ctr.next, %body ]
%c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
%cc = fcmp ogt float %ctr.iv, 7.0
br i1 %cc, label %break, label %body
body:
; Element 0 of the carried value is the coordinate of the next sample.
%c.iv0 = extractelement <4 x float> %c.iv, i32 0
%c.next = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.iv0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
%ctr.next = fadd float %ctr.iv, 2.0
br label %loop
break:
ret <4 x float> %c.iv
}
; Only intrinsic stores need exact execution -- other stores do not have
; externally visible effects and may require WQM for correctness.
;
; The volatile store to the alloca and the later load from it are expected to
; be emitted in WQM (after s_wqm); the loaded value is the coordinate of the
; image sample. The buffer-store intrinsics are expected in exact mode (after
; s_and with the saved exec mask).
define amdgpu_ps void @test_alloca(float %data, i32 %a, i32 %idx) nounwind {
; GFX9-W64-LABEL: test_alloca:
; GFX9-W64: ; %bb.0: ; %entry
; GFX9-W64-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX9-W64-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX9-W64-NEXT: s_mov_b32 s10, -1
; GFX9-W64-NEXT: s_mov_b32 s11, 0xe00000
; GFX9-W64-NEXT: s_add_u32 s8, s8, s0
; GFX9-W64-NEXT: s_addc_u32 s9, s9, 0
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v1, 4
; GFX9-W64-NEXT: v_lshl_add_u32 v1, v2, 2, v1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v[1:4], v1, s[0:7], s[0:3] dmask:0xf
; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1
; GFX9-W64-NEXT: buffer_store_dword v0, v5, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
; GFX9-W64-NEXT: s_endpgm
;
; GFX10-W32-LABEL: test_alloca:
; GFX10-W32: ; %bb.0: ; %entry
; GFX10-W32-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; GFX10-W32-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; GFX10-W32-NEXT: s_mov_b32 s10, -1
; GFX10-W32-NEXT: s_mov_b32 s11, 0x31c16000
; GFX10-W32-NEXT: s_add_u32 s8, s8, s0
; GFX10-W32-NEXT: s_addc_u32 s9, s9, 0
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_lshl_add_u32 v2, v2, 2, 4
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:4
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v[1:4], v1, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1
; GFX10-W32-NEXT: buffer_store_dword v0, v5, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dwordx4 v[1:4], off, s[0:3], 0
; GFX10-W32-NEXT: s_endpgm
entry:
%array = alloca [32 x i32], align 4, addrspace(5)
; Intrinsic buffer store.
call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
; Plain (non-intrinsic) volatile store into the alloca.
store volatile i32 %a, ptr addrspace(5) %array, align 4
; Intrinsic buffer store.
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %data, ptr addrspace(8) undef, i32 1, i32 0, i32 0, i32 0)
; Dynamically indexed load from the alloca; the value becomes the sample
; coordinate below.
%c.gep = getelementptr [32 x i32], ptr addrspace(5) %array, i32 0, i32 %idx
%c = load i32, ptr addrspace(5) %c.gep, align 4
%c.bc = bitcast i32 %c to float
%t = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
; Intrinsic buffer store of the sample result.
call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %t, ptr addrspace(8) undef, i32 0, i32 0, i32 0)
ret void
}
; Must return to exact at the end of a non-void returning shader,
; otherwise the EXEC mask exported by the epilog will be wrong. This is true
; even if the shader has no kills, because a kill could have happened in a
; previous shader fragment.
;
; The expected code saves exec on entry, enters WQM for the first sample, and
; restores the saved mask (s_and) before the final sample, so the function
; returns with the exact mask in exec.
define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
; GFX9-W64-LABEL: test_nonvoid_return:
; GFX9-W64: ; %bb.0:
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_nonvoid_return:
; GFX10-W32: ; %bb.0:
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: ; return to shader part epilog
; Dependent pair of samples: element 0 of %tex is the coordinate of %dtex.
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
ret <4 x float> %dtex
}
; Variant of test_nonvoid_return with a second path that performs a volatile
; global store and then ends in `unreachable` instead of returning.
; NOTE(review): unlike test_nonvoid_return, the expected code here has no
; instruction saving exec before s_wqm and ANDs exec with itself
; (s_and exec, exec, exec) -- confirm this is the intended output and not just
; current compiler behavior being recorded.
define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
; GFX9-W64-LABEL: test_nonvoid_return_unreachable:
; GFX9-W64: ; %bb.0: ; %entry
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1
; GFX9-W64-NEXT: s_and_b64 exec, exec, exec
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1
; GFX9-W64-NEXT: s_cbranch_scc0 .LBB38_2
; GFX9-W64-NEXT: ; %bb.1: ; %else
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_branch .LBB38_3
; GFX9-W64-NEXT: .LBB38_2: ; %if
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: .LBB38_3:
;
; GFX10-W32-LABEL: test_nonvoid_return_unreachable:
; GFX10-W32: ; %bb.0: ; %entry
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, exec_lo
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB38_2
; GFX10-W32-NEXT: ; %bb.1: ; %else
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_branch .LBB38_3
; GFX10-W32-NEXT: .LBB38_2: ; %if
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX10-W32-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-W32-NEXT: .LBB38_3:
entry:
; Dependent pair of samples: element 0 of %tex is the coordinate of %dtex.
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float undef, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
; Branch on the inreg (scalar) argument %c.
%cc = icmp sgt i32 %c, 0
br i1 %cc, label %if, label %else
if:
; This path never returns.
store volatile <4 x float> %dtex, ptr addrspace(1) undef
unreachable
else:
ret <4 x float> %dtex
}
; Test awareness that s_wqm_b64 clobbers SCC.
;
; The branch condition is a scalar compare of the inreg argument %sel. In the
; expected code the compare (s_cmp_lt_i32) is placed after the s_wqm
; instruction, so the SCC value read by s_cbranch_scc0 is the one the compare
; produced. Both arms sample an image; the buffer store in %end is preceded by
; the switch back to the saved exec mask.
define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
; GFX9-W64-LABEL: test_scc:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1
; GFX9-W64-NEXT: s_cbranch_scc0 .LBB39_2
; GFX9-W64-NEXT: ; %bb.1: ; %else
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: v_mov_b32_e32 v1, 1
; GFX9-W64-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf
; GFX9-W64-NEXT: s_cbranch_execz .LBB39_3
; GFX9-W64-NEXT: s_branch .LBB39_4
; GFX9-W64-NEXT: .LBB39_2:
; GFX9-W64-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX9-W64-NEXT: .LBB39_3: ; %if
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf
; GFX9-W64-NEXT: .LBB39_4: ; %end
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v5, 1.0
; GFX9-W64-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_scc:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB39_2
; GFX10-W32-NEXT: ; %bb.1: ; %else
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: v_mov_b32_e32 v1, 1
; GFX10-W32-NEXT: image_sample v[0:3], v[0:1], s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX10-W32-NEXT: s_cbranch_execz .LBB39_3
; GFX10-W32-NEXT: s_branch .LBB39_4
; GFX10-W32-NEXT: .LBB39_2:
; GFX10-W32-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX10-W32-NEXT: .LBB39_3: ; %if
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[0:3] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: .LBB39_4: ; %end
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT: v_mov_b32_e32 v5, 1.0
; GFX10-W32-NEXT: buffer_store_dword v5, v4, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; Scalar compare on the inreg argument.
%cc = icmp sgt i32 %sel, 0
br i1 %cc, label %if, label %else
if:
; 1D sample with a constant coordinate.
%r.if = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float 0.0, <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
br label %end
else:
; 2D sample with constant coordinates.
%r.else = call <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32 15, float 0.0, float bitcast (i32 1 to float), <8 x i32> undef, <4 x i32> undef, i1 false, i32 0, i32 0) #0
br label %end
end:
%r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float 1.0, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
ret <4 x float> %r
}
; Check a case of a block being entirely WQM except for a bit of WWM.
; There was a bug where it forgot to enter and leave WWM.
define amdgpu_ps float @test_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
; GFX9-W64-LABEL: test_wwm_within_wqm:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB40_2
; GFX9-W64-NEXT: ; %bb.1: ; %IF
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0
; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0
; GFX9-W64-NEXT: .LBB40_2: ; %ENDIF
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wwm_within_wqm:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB40_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0
; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0
; GFX10-W32-NEXT: .LBB40_2: ; %ENDIF
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %IF, label %ENDIF
IF:
; The two dependent samples are expected to run under the s_wqm issued at
; function entry; no exec change is expected between them.
%c.bc = bitcast i32 %c to float
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%dataf = extractelement <4 x float> %dtex, i32 0
%data1 = fptosi float %dataf to i32
; set.inactive is expected to expand to: copy, s_not exec, v_mov 0, s_not exec
; (the inactive lanes receive the second operand, 0).
%data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
; Pattern 2079 is printed as swizzle(SWAP,2) in the checks. The swizzle feeds
; the wwm intrinsic, so it is expected between s_or_saveexec -1 and the
; matching exec restore.
%data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
%data4 = call i32 @llvm.amdgcn.wwm.i32(i32 %data3)
%data4f = sitofp i32 %data4 to float
br label %ENDIF
ENDIF:
%r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
ret float %r
}
; Check that WWM is triggered by the strict_wwm intrinsic.
define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_strict_wwm1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; Both loads and the fadd feed strict.wwm, so the checks expect all three
; inside a single s_or_saveexec -1 ... exec-restore region.
%src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
%src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
%out = fadd float %src0, %src1
; The copy of the result into the return register (v0) is expected only
; after exec has been restored.
%out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
ret float %out.0
}
; Same as above, but with an integer type.
define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_strict_wwm2:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_u32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm2:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; Same shape as test_strict_wwm1, but the loaded floats are bitcast to i32 and
; added as integers, exercising the i32 overload of strict.wwm.
%src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
%src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
%src0.0 = bitcast float %src0 to i32
%src1.0 = bitcast float %src1 to i32
; The integer add is expected inside the s_or_saveexec -1 region, together
; with both loads.
%out = add i32 %src0.0, %src1.0
%out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
%out.1 = bitcast i32 %out.0 to float
ret float %out.1
}
; Check that we don't leave WWM on for computations that don't require WWM,
; since that will lead to clobbering things that aren't supposed to be clobbered
; in cases like this.
; We enforce this by checking that v_add gets emitted in the same block as
; WWM computations.
define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
; GFX9-W64-LABEL: test_strict_wwm3:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB43_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_add_f32_e32 v0, v1, v0
; GFX9-W64-NEXT: .LBB43_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm3:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB43_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_add_f32_e32 v0, v1, v0
; GFX10-W32-NEXT: .LBB43_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; use mbcnt to make sure the branch is divergent
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
%cc = icmp uge i32 %hi, 16
br i1 %cc, label %endif, label %if
if:
; The load and the first fadd feed strict.wwm and are expected inside the
; s_or_saveexec -1 region.
%src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
%out = fadd float %src, %src
%out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
; This fadd only consumes the WWM result; the checks expect it after exec has
; been restored, still inside the %if block.
%out.1 = fadd float %src, %out.0
br label %endif
endif:
%out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
ret float %out.2
}
; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
; write could clobber disabled channels in the non-WWM one.
; We enforce this by checking that v_mov gets emitted in the same block as
; WWM computations.
define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
; GFX9-W64-LABEL: test_strict_wwm4:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[2:3], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB44_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: .LBB44_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm4:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s1, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB44_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_or_saveexec_b32 s2, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s2
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: .LBB44_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; use mbcnt to make sure the branch is divergent
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
%cc = icmp uge i32 %hi, 16
br i1 %cc, label %endif, label %if
if:
%src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
%out = fadd float %src, %src
; The WWM value is expected in its own register (v1) and copied into the phi
; register (v0, preset to 0 in the entry block) only after exec is restored.
%out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
br label %endif
endif:
%out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
ret float %out.1
}
; Make sure the transition from Exact to WWM then WQM works properly.
define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
; GFX9-W64-LABEL: test_strict_wwm5:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s0
; GFX9-W64-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v1
; GFX9-W64-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[2:3]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm5:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v0, s0
; GFX10-W32-NEXT: s_mov_b32 s2, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dword v2, v0, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: buffer_load_dword v1, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s2
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; Phase 1 (exact): this load/store pair is expected with the unmodified exec
; mask; the checks save a copy of exec at entry.
%src0 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %src0, ptr addrspace(8) undef, i32 %idx0, i32 0, i32 0, i32 0)
; Phase 2 (strict WWM): the second load and the fadd feeding strict.wwm are
; expected between s_or_saveexec -1 and the exec restore.
%src1 = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx1, i32 0, i32 0, i32 0)
%temp = fadd float %src1, %src1
%temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
; Phase 3 (WQM): s_wqm is expected right after the WWM region ends; the final
; fadd runs under it, then exec is and-ed with the copy saved at entry.
%out = fadd float %temp.0, %temp.0
%out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
ret float %out.0
}
; Check that WWM is turned on correctly across basic block boundaries.
; if..then..endif version
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
;SI-CHECK: buffer_load_dword
;VI-CHECK: flat_load_dword
define amdgpu_ps float @test_strict_wwm6_then() {
; GFX9-W64-LABEL: test_strict_wwm6_then:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX9-W64-NEXT: v_cmp_gt_u32_e32 vcc, 16, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB46_2
; GFX9-W64-NEXT: ; %bb.1: ; %if
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v1, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: .LBB46_2: ; %endif
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm6_then:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v0, -1, v0
; GFX10-W32-NEXT: v_cmp_gt_u32_e32 vcc_lo, 16, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB46_2
; GFX10-W32-NEXT: ; %bb.1: ; %if
; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v1, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: .LBB46_2: ; %endif
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; %src0 is defined here but only consumed by the WWM fadd in %if, so the
; checks expect this entry-block load wrapped in its own s_or_saveexec -1
; region, with exec restored before the mbcnt/compare.
%src0 = load volatile float, ptr addrspace(1) undef
; use mbcnt to make sure the branch is divergent
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
%cc = icmp uge i32 %hi, 16
br i1 %cc, label %endif, label %if
if:
; A second s_or_saveexec -1 region is expected in this block, covering the
; load of %src1 and the fadd.
%src1 = load volatile float, ptr addrspace(1) undef
%out = fadd float %src0, %src1
%out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
br label %endif
endif:
%out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
ret float %out.1
}
; Check that WWM is turned on correctly across basic block boundaries.
; loop version
define amdgpu_ps float @test_strict_wwm6_loop() {
; GFX9-W64-LABEL: test_strict_wwm6_loop:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: global_load_dword v1, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX9-W64-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
; GFX9-W64-NEXT: s_mov_b64 s[0:1], 0
; GFX9-W64-NEXT: .LBB47_1: ; %loop
; GFX9-W64-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: global_load_dword v2, v[3:4], off glc
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: v_add_u32_e32 v3, -1, v3
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_add_f32_e32 v2, v1, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: s_cbranch_execnz .LBB47_1
; GFX9-W64-NEXT: ; %bb.2: ; %endloop
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm6_loop:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: global_load_dword v1, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX10-W32-NEXT: s_mov_b32 s0, 0
; GFX10-W32-NEXT: v_mbcnt_hi_u32_b32 v3, -1, v0
; GFX10-W32-NEXT: .LBB47_1: ; %loop
; GFX10-W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT: global_load_dword v2, v[3:4], off glc dlc
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_add_nc_u32_e32 v3, -1, v3
; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT: v_add_f32_e32 v2, v1, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX10-W32-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: s_cbranch_execnz .LBB47_1
; GFX10-W32-NEXT: ; %bb.2: ; %endloop
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; %src0 is loaded before the loop but consumed by the WWM fadd inside it; the
; checks expect this load in its own s_or_saveexec -1 region.
%src0 = load volatile float, ptr addrspace(1) undef
; use mbcnt to make sure the branch is divergent
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
br label %loop
loop:
; The trip count is seeded from mbcnt (%hi), so it differs per lane.
%counter = phi i32 [ %hi, %main_body ], [ %counter.1, %loop ]
; In the loop body the checks expect two separate s_or_saveexec -1 regions,
; one around the load and one around the fadd, with the counter decrement
; emitted between them under the restored exec mask.
%src1 = load volatile float, ptr addrspace(1) undef
%out = fadd float %src0, %src1
%out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
%counter.1 = sub i32 %counter, 1
%cc = icmp ne i32 %counter.1, 0
br i1 %cc, label %loop, label %endloop
endloop:
ret float %out.0
}
; Check that @llvm.amdgcn.set.inactive disables WWM.
define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
; GFX9-W64-LABEL: test_strict_wwm_set_inactive1:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: v_add_u32_e32 v0, v0, v0
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
; GFX9-W64-NEXT: s_endpgm
;
; GFX10-W32-LABEL: test_strict_wwm_set_inactive1:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: v_add_nc_u32_e32 v0, v0, v0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT: buffer_store_dword v2, v1, s[0:3], 0 idxen
; GFX10-W32-NEXT: s_endpgm
main_body:
; The load is expected with the normal exec mask: it is emitted before any
; exec manipulation in the checks.
%src = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
%src.0 = bitcast float %src to i32
; set.inactive is expected to expand to copy / s_not exec / v_mov 0 /
; s_not exec, all emitted before the s_or_saveexec -1 region begins.
%src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
; Only this add is expected inside the s_or_saveexec -1 region.
%out = add i32 %src.1, %src.1
%out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
%out.1 = bitcast i32 %out.0 to float
; The store is expected after exec has been restored.
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %out.1, ptr addrspace(8) undef, i32 %idx, i32 0, i32 0, i32 0)
ret void
}
; Check a case of a block being entirely WQM except for a bit of WWM.
; There was a bug where it forgot to enter and leave WWM.
define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
; GFX9-W64-LABEL: test_strict_wwm_within_wqm:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB49_2
; GFX9-W64-NEXT: ; %bb.1: ; %IF
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v2, 0
; GFX9-W64-NEXT: s_not_b64 exec, exec
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v1, v0
; GFX9-W64-NEXT: .LBB49_2: ; %ENDIF
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v1
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm_within_wqm:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0
; GFX10-W32-NEXT: s_and_saveexec_b32 s13, vcc_lo
; GFX10-W32-NEXT: s_cbranch_execz .LBB49_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v2, 0
; GFX10-W32-NEXT: s_not_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v1, v0
; GFX10-W32-NEXT: .LBB49_2: ; %ENDIF
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v1
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %IF, label %ENDIF
IF:
; The two dependent samples are expected to run under the s_wqm issued at
; function entry; no exec change is expected between them.
%c.bc = bitcast i32 %c to float
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%dataf = extractelement <4 x float> %dtex, i32 0
%data1 = fptosi float %dataf to i32
; set.inactive is expected to expand to: copy, s_not exec, v_mov 0, s_not exec
; (the inactive lanes receive the second operand, 0).
%data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
; Pattern 2079 is printed as swizzle(SWAP,2) in the checks. The swizzle feeds
; strict.wwm, so it is expected between s_or_saveexec -1 and the matching
; exec restore.
%data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
%data4 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %data3)
%data4f = sitofp i32 %data4 to float
br label %ENDIF
ENDIF:
%r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
ret float %r
}
; Check a case of a block being entirely WQM except for a bit of STRICT WQM.
define amdgpu_ps float @test_strict_wqm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
; GFX9-W64-LABEL: test_strict_wqm_within_wqm:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v2, v0
; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
; GFX9-W64-NEXT: v_mov_b32_e32 v0, 0
; GFX9-W64-NEXT: s_and_saveexec_b64 s[14:15], vcc
; GFX9-W64-NEXT: s_cbranch_execz .LBB50_2
; GFX9-W64-NEXT: ; %bb.1: ; %IF
; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX9-W64-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-W64-NEXT: .LBB50_2: ; %ENDIF
; GFX9-W64-NEXT: s_or_b64 exec, exec, s[14:15]
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm_within_wqm:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v2, v0
; GFX10-W32-NEXT: v_mov_b32_e32 v0, 0
; GFX10-W32-NEXT: s_mov_b32 s13, exec_lo
; GFX10-W32-NEXT: v_cmpx_eq_u32_e32 0, v1
; GFX10-W32-NEXT: s_cbranch_execz .LBB50_2
; GFX10-W32-NEXT: ; %bb.1: ; %IF
; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: image_sample v2, v2, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_cvt_i32_f32_e32 v2, v2
; GFX10-W32-NEXT: ds_swizzle_b32 v2, v2 offset:swizzle(SWAP,2)
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-W32-NEXT: .LBB50_2: ; %ENDIF
; GFX10-W32-NEXT: s_or_b32 exec_lo, exec_lo, s13
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
%cmp = icmp eq i32 %z, 0
br i1 %cmp, label %IF, label %ENDIF
IF:
; The two dependent samples are expected to run under the s_wqm issued at
; function entry.
%c.bc = bitcast i32 %c to float
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%tex0 = extractelement <4 x float> %tex, i32 0
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
%dataf = extractelement <4 x float> %dtex, i32 0
%data1 = fptosi float %dataf to i32
; Unlike the strict.wwm variant there is no set.inactive here, and the checks
; expect no extra exec save/restore around the swizzle that feeds strict.wqm.
; Pattern 2079 is printed as swizzle(SWAP,2).
%data2 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data1, i32 2079)
%data3 = call i32 @llvm.amdgcn.strict.wqm.i32(i32 %data2)
%data3f = sitofp i32 %data3 to float
br label %ENDIF
ENDIF:
%r = phi float [ %data3f, %IF ], [ 0.0, %main_body ]
ret float %r
}
;TODO: StrictWQM -> WQM transition could be improved. WQM could use the exec from the previous state instead of calling s_wqm again.
define amdgpu_ps float @test_strict_wqm_strict_wwm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, ptr addrspace(8) inreg %res2, float %inp, <8 x i32> inreg %res3) {
; GFX9-W64-LABEL: test_strict_wqm_strict_wwm_wqm:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[28:29], exec
; GFX9-W64-NEXT: s_mov_b32 s19, s17
; GFX9-W64-NEXT: s_mov_b64 s[30:31], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_mov_b32 s23, s5
; GFX9-W64-NEXT: s_mov_b32 s22, s4
; GFX9-W64-NEXT: s_mov_b32 s21, s3
; GFX9-W64-NEXT: s_mov_b32 s20, s2
; GFX9-W64-NEXT: s_mov_b32 s27, s9
; GFX9-W64-NEXT: s_mov_b32 s26, s8
; GFX9-W64-NEXT: s_mov_b32 s25, s7
; GFX9-W64-NEXT: s_mov_b32 s24, s6
; GFX9-W64-NEXT: s_mov_b32 s18, s16
; GFX9-W64-NEXT: s_mov_b32 s17, s15
; GFX9-W64-NEXT: s_mov_b32 s16, s14
; GFX9-W64-NEXT: s_mov_b32 s15, s13
; GFX9-W64-NEXT: s_mov_b32 s14, s12
; GFX9-W64-NEXT: s_mov_b32 s13, s11
; GFX9-W64-NEXT: s_mov_b32 s12, s10
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s1
; GFX9-W64-NEXT: s_mov_b64 exec, s[30:31]
; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v3, s0
; GFX9-W64-NEXT: buffer_load_dword v3, v3, s[24:27], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: v_add_f32_e32 v2, v2, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v4, v3
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v4
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[28:29]
; GFX9-W64-NEXT: image_sample v0, v0, s[12:19], s[20:23] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[20:23], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wqm_strict_wwm_wqm:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s28, exec_lo
; GFX10-W32-NEXT: s_mov_b32 s19, s17
; GFX10-W32-NEXT: s_mov_b32 s29, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_mov_b32 s23, s5
; GFX10-W32-NEXT: s_mov_b32 s22, s4
; GFX10-W32-NEXT: s_mov_b32 s21, s3
; GFX10-W32-NEXT: s_mov_b32 s20, s2
; GFX10-W32-NEXT: s_mov_b32 s27, s9
; GFX10-W32-NEXT: s_mov_b32 s26, s8
; GFX10-W32-NEXT: s_mov_b32 s25, s7
; GFX10-W32-NEXT: s_mov_b32 s24, s6
; GFX10-W32-NEXT: s_mov_b32 s18, s16
; GFX10-W32-NEXT: s_mov_b32 s17, s15
; GFX10-W32-NEXT: s_mov_b32 s16, s14
; GFX10-W32-NEXT: s_mov_b32 s15, s13
; GFX10-W32-NEXT: s_mov_b32 s14, s12
; GFX10-W32-NEXT: s_mov_b32 s13, s11
; GFX10-W32-NEXT: s_mov_b32 s12, s10
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s29
; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: s_or_saveexec_b32 s1, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v3, s0
; GFX10-W32-NEXT: buffer_load_dword v3, v3, s[24:27], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s1
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
; GFX10-W32-NEXT: v_add_f32_e32 v2, v2, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v3
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v4
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s28
; GFX10-W32-NEXT: image_sample v0, v0, s[12:19], s[20:23] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[20:23], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
%reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
%temp = fadd float %reload, %reload
%temp2 = call float @llvm.amdgcn.strict.wqm.f32(float %temp)
%temp3 = fadd float %temp2, %temp2
%reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res2, i32 %idx0, i32 0, i32 0, i32 0)
%temp4 = call float @llvm.amdgcn.strict.wwm.f32(float %reload_wwm)
%temp5 = fadd float %temp3, %temp4
%res.int = ptrtoint ptr addrspace(8) %res to i128
%res.vec = bitcast i128 %res.int to <4 x i32>
%tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res3, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
%out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
ret float %out
}
; Exercises a StrictWWM -> StrictWQM -> WQM sequence inside one basic block:
;   * a buffer load whose doubled value is passed through llvm.amdgcn.strict.wwm,
;   * a second buffer load passed through llvm.amdgcn.strict.wqm,
;   * an image sample consuming the sum of both.
; The expected output wraps the strict.wwm computation in s_or_saveexec -1 /
; exec restore pairs, wraps the strict.wqm load in a save-exec / s_wqm / restore
; sequence, and narrows exec back with the saved mask right before the sample.
define amdgpu_ps float @test_strict_wwm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) {
; GFX9-W64-LABEL: test_strict_wwm_strict_wqm_wqm:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[20:21], exec
; GFX9-W64-NEXT: s_mov_b32 s15, s13
; GFX9-W64-NEXT: s_mov_b64 s[22:23], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_mov_b32 s19, s5
; GFX9-W64-NEXT: s_mov_b32 s18, s4
; GFX9-W64-NEXT: s_mov_b32 s17, s3
; GFX9-W64-NEXT: s_mov_b32 s16, s2
; GFX9-W64-NEXT: s_mov_b32 s14, s12
; GFX9-W64-NEXT: s_mov_b32 s13, s11
; GFX9-W64-NEXT: s_mov_b32 s12, s10
; GFX9-W64-NEXT: s_mov_b32 s11, s9
; GFX9-W64-NEXT: s_mov_b32 s10, s8
; GFX9-W64-NEXT: s_mov_b32 s9, s7
; GFX9-W64-NEXT: s_mov_b32 s8, s6
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT: s_or_saveexec_b64 s[2:3], -1
; GFX9-W64-NEXT: v_mov_b32_e32 v2, s1
; GFX9-W64-NEXT: buffer_load_dword v2, v2, s[16:19], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 exec, s[2:3]
; GFX9-W64-NEXT: s_mov_b64 s[0:1], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: buffer_load_dword v3, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_or_saveexec_b64 s[0:1], -1
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: v_add_f32_e32 v2, v2, v2
; GFX9-W64-NEXT: s_mov_b64 exec, s[0:1]
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, v2
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_mov_b32_e32 v4, v3
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v4
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[20:21]
; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_strict_wwm_strict_wqm_wqm:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s20, exec_lo
; GFX10-W32-NEXT: s_mov_b32 s15, s13
; GFX10-W32-NEXT: s_mov_b32 s21, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_mov_b32 s19, s5
; GFX10-W32-NEXT: s_mov_b32 s18, s4
; GFX10-W32-NEXT: s_mov_b32 s17, s3
; GFX10-W32-NEXT: s_mov_b32 s16, s2
; GFX10-W32-NEXT: s_mov_b32 s14, s12
; GFX10-W32-NEXT: s_mov_b32 s13, s11
; GFX10-W32-NEXT: s_mov_b32 s12, s10
; GFX10-W32-NEXT: s_mov_b32 s11, s9
; GFX10-W32-NEXT: s_mov_b32 s10, s8
; GFX10-W32-NEXT: s_mov_b32 s9, s7
; GFX10-W32-NEXT: s_mov_b32 s8, s6
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s21
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: v_mov_b32_e32 v2, s1
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: buffer_load_dword v2, v2, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_mov_b32 s0, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: buffer_load_dword v3, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_or_saveexec_b32 s0, -1
; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
; GFX10-W32-NEXT: v_add_f32_e32 v2, v2, v2
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s0
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v0, v2
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v3
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v4
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; Store the input at %idx0, then reload from %idx1; the doubled reload is the
; value routed through strict.wwm.
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
%reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
%temp = fadd float %reload, %reload
%temp2 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
%temp3 = fadd float %temp2, %temp2
; Despite its name, %reload_wwm feeds strict.wqm (not strict.wwm) in this test.
%reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
%temp4 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
%temp5 = fadd float %temp3, %temp4
; Reinterpret the buffer resource as <4 x i32> so it can be passed as the
; sampler operand of the image sample below.
%res.int = ptrtoint ptr addrspace(8) %res to i128
%res.vec = bitcast i128 %res.int to <4 x i32>
%tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp5, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
; Store the sampled value and return what is read back from the same slot.
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
%out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
ret float %out
}
; TODO: The WQM -> StrictWQM transition could be improved: StrictWQM could reuse the exec from the previous WQM state instead of issuing s_wqm again.
; Exercises a WQM -> StrictWQM -> WQM sequence inside one basic block:
;   * a first image sample whose coordinate comes from a buffer load,
;   * a buffer load passed through llvm.amdgcn.strict.wqm,
;   * a second image sample consuming the sum of the first sample (doubled)
;     and the strict.wqm value.
; In the expected output both buffer loads are issued back to back after an
; s_wqm, and exec is narrowed with the saved mask before the second sample.
define amdgpu_ps float @test_wqm_strict_wqm_wqm(i32 inreg %idx0, i32 inreg %idx1, ptr addrspace(8) inreg %res, float %inp, <8 x i32> inreg %res2) {
; GFX9-W64-LABEL: test_wqm_strict_wqm_wqm:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b64 s[20:21], exec
; GFX9-W64-NEXT: s_mov_b32 s15, s13
; GFX9-W64-NEXT: s_mov_b64 s[22:23], exec
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: s_mov_b32 s19, s5
; GFX9-W64-NEXT: s_mov_b32 s18, s4
; GFX9-W64-NEXT: s_mov_b32 s17, s3
; GFX9-W64-NEXT: s_mov_b32 s16, s2
; GFX9-W64-NEXT: s_mov_b32 s14, s12
; GFX9-W64-NEXT: s_mov_b32 s13, s11
; GFX9-W64-NEXT: s_mov_b32 s12, s10
; GFX9-W64-NEXT: s_mov_b32 s11, s9
; GFX9-W64-NEXT: s_mov_b32 s10, s8
; GFX9-W64-NEXT: s_mov_b32 s9, s7
; GFX9-W64-NEXT: s_mov_b32 s8, s6
; GFX9-W64-NEXT: v_mov_b32_e32 v1, s0
; GFX9-W64-NEXT: s_mov_b64 exec, s[22:23]
; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
; GFX9-W64-NEXT: v_mov_b32_e32 v0, s1
; GFX9-W64-NEXT: buffer_load_dword v0, v0, s[16:19], 0 idxen
; GFX9-W64-NEXT: s_nop 0
; GFX9-W64-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(1)
; GFX9-W64-NEXT: v_mov_b32_e32 v3, v2
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v0
; GFX9-W64-NEXT: v_add_f32_e32 v0, v0, v3
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[20:21]
; GFX9-W64-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
; GFX9-W64-NEXT: ; return to shader part epilog
;
; GFX10-W32-LABEL: test_wqm_strict_wqm_wqm:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s20, exec_lo
; GFX10-W32-NEXT: s_mov_b32 s15, s13
; GFX10-W32-NEXT: s_mov_b32 s21, exec_lo
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_mov_b32 s19, s5
; GFX10-W32-NEXT: s_mov_b32 s18, s4
; GFX10-W32-NEXT: s_mov_b32 s17, s3
; GFX10-W32-NEXT: s_mov_b32 s16, s2
; GFX10-W32-NEXT: s_mov_b32 s14, s12
; GFX10-W32-NEXT: s_mov_b32 s13, s11
; GFX10-W32-NEXT: s_mov_b32 s12, s10
; GFX10-W32-NEXT: s_mov_b32 s11, s9
; GFX10-W32-NEXT: s_mov_b32 s10, s8
; GFX10-W32-NEXT: s_mov_b32 s9, s7
; GFX10-W32-NEXT: s_mov_b32 s8, s6
; GFX10-W32-NEXT: v_mov_b32_e32 v1, s0
; GFX10-W32-NEXT: s_mov_b32 exec_lo, s21
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: v_mov_b32_e32 v3, s1
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX10-W32-NEXT: s_clause 0x1
; GFX10-W32-NEXT: buffer_load_dword v0, v3, s[16:19], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v2, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(1)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_mov_b32_e32 v3, v2
; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v0
; GFX10-W32-NEXT: v_add_f32_e32 v0, v0, v3
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s20
; GFX10-W32-NEXT: image_sample v0, v0, s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: buffer_store_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT: buffer_load_dword v0, v1, s[16:19], 0 idxen
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
; GFX10-W32-NEXT: ; return to shader part epilog
main_body:
; Store the input at %idx0, reload from %idx1 and use the doubled value as the
; coordinate of the first image sample.
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %inp, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
%reload = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx1, i32 0, i32 0, i32 0)
%temp = fadd float %reload, %reload
; Reinterpret the buffer resource as <4 x i32> so it can be passed as the
; sampler operand of both image samples.
%res.int = ptrtoint ptr addrspace(8) %res to i128
%res.vec = bitcast i128 %res.int to <4 x i32>
%tex = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
%temp2 = fadd float %tex, %tex
; Despite its name, %reload_wwm feeds strict.wqm (not strict.wwm) in this test.
%reload_wwm = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
%temp3 = call float @llvm.amdgcn.strict.wqm.f32(float %reload_wwm)
%temp4 = fadd float %temp2, %temp3
%tex2 = call float @llvm.amdgcn.image.sample.1d.f32.f32(i32 1, float %temp4, <8 x i32> %res2, <4 x i32> %res.vec, i1 false, i32 0, i32 0)
; Store the second sample and return what is read back from the same slot.
call void @llvm.amdgcn.struct.ptr.buffer.store.f32(float %tex2, ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
%out = call float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8) %res, i32 %idx0, i32 0, i32 0, i32 0)
ret float %out
}
; Check that the correct VCC register is selected: in Wave32 mode the WQM pass
; must use VCC_LO (not the full 64-bit VCC) for vector comparisons.
; Loads one float through llvm.amdgcn.s.buffer.load, compares it against 0.0
; and feeds the result to llvm.amdgcn.kill. The expected output performs the
; compare into vcc and combines it with exec on the wave64 target, and into
; vcc_lo combined with exec_lo on the wave32 target.
define amdgpu_ps void @test_for_deactivating_lanes_in_wave32(ptr addrspace(6) inreg %0) {
; GFX9-W64-LABEL: test_for_deactivating_lanes_in_wave32:
; GFX9-W64: ; %bb.0: ; %main_body
; GFX9-W64-NEXT: s_mov_b32 s3, 0x31016fac
; GFX9-W64-NEXT: s_mov_b32 s2, 32
; GFX9-W64-NEXT: s_mov_b32 s1, 0x8000
; GFX9-W64-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
; GFX9-W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-W64-NEXT: v_cmp_le_f32_e64 vcc, s0, 0
; GFX9-W64-NEXT: s_andn2_b64 s[4:5], exec, vcc
; GFX9-W64-NEXT: s_cbranch_scc0 .LBB54_1
; GFX9-W64-NEXT: s_endpgm
; GFX9-W64-NEXT: .LBB54_1:
; GFX9-W64-NEXT: s_mov_b64 exec, 0
; GFX9-W64-NEXT: exp null off, off, off, off done vm
; GFX9-W64-NEXT: s_endpgm
;
; GFX10-W32-LABEL: test_for_deactivating_lanes_in_wave32:
; GFX10-W32: ; %bb.0: ; %main_body
; GFX10-W32-NEXT: s_mov_b32 s3, 0x31016fac
; GFX10-W32-NEXT: s_mov_b32 s2, 32
; GFX10-W32-NEXT: s_mov_b32 s1, 0x8000
; GFX10-W32-NEXT: s_buffer_load_dword s0, s[0:3], 0x0
; GFX10-W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-W32-NEXT: v_cmp_le_f32_e64 vcc_lo, s0, 0
; GFX10-W32-NEXT: s_andn2_b32 s4, exec_lo, vcc_lo
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB54_1
; GFX10-W32-NEXT: s_endpgm
; GFX10-W32-NEXT: .LBB54_1:
; GFX10-W32-NEXT: s_mov_b32 exec_lo, 0
; GFX10-W32-NEXT: exp null off, off, off, off done vm
; GFX10-W32-NEXT: s_endpgm
main_body:
; Build the <4 x i32> operand for the scalar buffer load: element 0 is the
; incoming pointer, elements 1-3 are the constants 32768 (0x8000), 32 and
; 822177708 (0x31016fac) that appear as s_mov_b32 immediates in the checks.
%1 = ptrtoint ptr addrspace(6) %0 to i32
%2 = insertelement <4 x i32> <i32 poison, i32 32768, i32 32, i32 822177708>, i32 %1, i32 0
%3 = call nsz arcp float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %2, i32 0, i32 0) #3
; The "ugt 0.0" condition passed to kill shows up in the checks as a
; v_cmp_le followed by s_andn2 against exec.
%4 = fcmp nsz arcp ugt float %3, 0.000000e+00
call void @llvm.amdgcn.kill(i1 %4) #1
ret void
}
; Declarations of the AMDGPU intrinsics called by the tests in this file.
; A trailing "#N" refers to one of the attribute groups defined at the end of
; the file.
; NOTE(review): the struct/raw (ptr) buffer store intrinsics below are tagged
; #2, which is defined as { nounwind readonly } -- that looks inconsistent with
; store semantics; confirm whether it is intentional before relying on it.
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
declare void @llvm.amdgcn.struct.buffer.store.f32(float, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.struct.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #2
declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #3
declare float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32>, i32, i32, i32, i32) #3
declare void @llvm.amdgcn.struct.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.struct.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32 immarg) #2
declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) #2
declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) #3
declare float @llvm.amdgcn.struct.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare float @llvm.amdgcn.image.sample.1d.f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #3
declare void @llvm.amdgcn.kill(i1) #1
declare float @llvm.amdgcn.wqm.f32(float) #3
declare i32 @llvm.amdgcn.wqm.i32(i32) #3
declare float @llvm.amdgcn.strict.wwm.f32(float) #3
declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
declare float @llvm.amdgcn.wwm.f32(float) #3
declare i32 @llvm.amdgcn.wwm.i32(i32) #3
declare float @llvm.amdgcn.strict.wqm.f32(float) #3
declare i32 @llvm.amdgcn.strict.wqm.i32(i32) #3
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #3
declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #1
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #2
declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #2
declare i32 @llvm.amdgcn.ds.swizzle(i32, i32)
declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg) #7
; Attribute groups referenced through "#N" on the declarations and call sites
; in this file.
; NOTE(review): #5 and #6 are not referenced by any declaration in this part of
; the file; presumably they are attached to test functions defined earlier --
; verify before removing or renumbering.
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }
attributes #4 = { nounwind readnone convergent }
attributes #5 = { "amdgpu-ps-wqm-outputs" }
attributes #6 = { nounwind "InitialPSInputAddr"="2" }
attributes #7 = { nounwind readnone willreturn }