While older waves execute long sequences of VALU instructions, younger waves may be prevented from calculating addresses and then issuing their VMEM loads, which in turn leaves the VALU unit idle. This patch tries to prevent this by temporarily raising the wave's priority. Reviewed By: foad. Differential Revision: https://reviews.llvm.org/D124246
154 lines
4.6 KiB
LLVM
154 lines
4.6 KiB
LLVM
; RUN: llc -mtriple=amdgcn -amdgpu-set-wave-priority=true -o - %s | \
|
|
; RUN: FileCheck %s
|
|
|
|
; A shader that only returns a constant and issues no load: no s_setprio may
; appear anywhere before the shader epilog.
; CHECK-LABEL: no_setprio:
; CHECK-NOT:   s_setprio
; CHECK:       ; return to shader part epilog
define amdgpu_ps <2 x float> @no_setprio() {
  ret <2 x float> zeroinitializer
}
|
|
|
|
; A single block that both loads and returns: the priority is raised to 3
; before the buffer load and reset to 0 by the very next instruction.
; CHECK-LABEL: vmem_in_exit_block:
; CHECK:       s_setprio 3
; CHECK:       buffer_load_dwordx2
; CHECK-NEXT:  s_setprio 0
; CHECK:       ; return to shader part epilog
define amdgpu_ps <2 x float> @vmem_in_exit_block(<4 x i32> inreg %p) {
  %loaded = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
  ret <2 x float> %loaded
}
|
|
|
|
; The priority is raised before the conditional branch. The path through %b
; resets it directly after its buffer load; the load-free path through %a
; resets it as the first instruction of that block.
; CHECK-LABEL: branch:
; CHECK:       s_setprio 3
; CHECK:       s_cbranch_scc0 [[A:.*]]
; CHECK:       {{.*}}: ; %b
; CHECK:       buffer_load_dwordx2
; CHECK-NEXT:  s_setprio 0
; CHECK:       s_branch [[EXIT:.*]]
; CHECK:       [[A]]: ; %a
; CHECK-NEXT:  s_setprio 0
; CHECK:       s_branch [[EXIT]]
; CHECK-NEXT:  [[EXIT]]:
define amdgpu_ps <2 x float> @branch(<4 x i32> inreg %p, i32 inreg %i) {
  %is_zero = icmp eq i32 %i, 0
  br i1 %is_zero, label %a, label %b

a:
  ; No load on this path.
  ret <2 x float> zeroinitializer

b:
  %loaded = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
  ret <2 x float> %loaded
}
|
|
|
|
; Two loads in sequence (%entry, then %a): the priority is raised before the
; first load and reset right after the load in %a. Block %b must not contain
; another s_setprio, while %c, which is also reachable straight from %entry,
; starts with its own s_setprio 0.
; CHECK-LABEL: setprio_follows_setprio:
; CHECK:       s_setprio 3
; CHECK:       buffer_load_dwordx2
; CHECK:       s_cbranch_scc1 [[C:.*]]
; CHECK:       {{.*}}: ; %a
; CHECK:       buffer_load_dwordx2
; CHECK-NEXT:  s_setprio 0
; CHECK:       s_cbranch_scc1 [[C]]
; CHECK:       {{.*}}: ; %b
; CHECK-NOT:   s_setprio
; CHECK:       s_branch [[EXIT:.*]]
; CHECK:       [[C]]: ; %c
; CHECK-NEXT:  s_setprio 0
; CHECK:       s_branch [[EXIT]]
; CHECK:       [[EXIT]]:
define amdgpu_ps <2 x float> @setprio_follows_setprio(<4 x i32> inreg %p, i32 inreg %i) {
entry:
  %first = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
  %nonzero = icmp ne i32 %i, 0
  br i1 %nonzero, label %a, label %c

a:
  %second = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
  %not_one = icmp ne i32 %i, 1
  br i1 %not_one, label %b, label %c

b:
  ret <2 x float> %second

c:
  ; Reached either directly from %entry or after the second load in %a.
  %merged = phi <2 x float> [%first, %entry], [%second, %a]
  %total = fadd <2 x float> %first, %merged
  ret <2 x float> %total
}
|
|
|
|
; A load inside a loop: the priority is raised once in %entry, no s_setprio
; is expected anywhere in the loop body, and the reset to 0 is the first
; instruction of %exit.
; CHECK-LABEL: loop:
; CHECK:       {{.*}}: ; %entry
; CHECK:       s_setprio 3
; CHECK-NOT:   s_setprio
; CHECK:       [[LOOP:.*]]: ; %loop
; CHECK-NOT:   s_setprio
; CHECK:       buffer_load_dwordx2
; CHECK-NOT:   s_setprio
; CHECK:       s_cbranch_scc1 [[LOOP]]
; CHECK-NEXT:  {{.*}}: ; %exit
; CHECK-NEXT:  s_setprio 0
define amdgpu_ps <2 x float> @loop(<4 x i32> inreg %p) {
entry:
  br label %loop

loop:
  %idx = phi i32 [0, %entry], [%idx.next, %loop]
  %acc = phi <2 x float> [zeroinitializer, %entry], [%acc.next, %loop]

  %idx.next = add i32 %idx, 1

  ; Load the element selected by the current index and accumulate it.
  %elt = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 %idx, i32 0, i32 0, i32 0)
  %acc.next = fadd <2 x float> %acc, %elt

  ; Iterate while the incremented index is below 5.
  %again = icmp ult i32 %idx.next, 5
  br i1 %again, label %loop, label %exit

exit:
  ret <2 x float> %acc.next
}
|
|
|
|
; After the load in %entry, control either enters a load-free loop or goes to
; %another_load. The priority is raised before the first load and must not be
; reset before the branch. On the loop path the reset is the first instruction
; of %loop.preheader, and neither %loop nor %exit may contain another
; s_setprio. On the other path the reset follows the second load directly.
; CHECK-LABEL: edge_split:
; CHECK:       s_setprio 3
; CHECK:       buffer_load_dwordx2
; CHECK-NOT:   s_setprio
; CHECK:       s_cbranch_scc1 [[ANOTHER_LOAD:.*]]
; CHECK:       {{.*}}: ; %loop.preheader
; CHECK-NEXT:  s_setprio 0
; CHECK:       [[LOOP:.*]]: ; %loop
; CHECK-NOT:   s_setprio
; CHECK:       s_cbranch_scc1 [[LOOP]]
; CHECK:       {{.*}}: ; %exit
; CHECK-NOT:   s_setprio
; CHECK:       s_branch [[RET:.*]]
; CHECK:       [[ANOTHER_LOAD]]: ; %another_load
; CHECK:       buffer_load_dwordx2
; CHECK-NEXT:  s_setprio 0
; CHECK:       s_branch [[RET]]
; CHECK:       [[RET]]:
define amdgpu_ps <2 x float> @edge_split(<4 x i32> inreg %p, i32 inreg %x) {
entry:
  %v = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 0, i32 0)
  %cond = icmp ne i32 %x, 0
  br i1 %cond, label %loop, label %another_load

loop:
  ; Arithmetic only: repeatedly multiplies by the loaded value, no load here.
  %i = phi i32 [0, %entry], [%i2, %loop]
  %mul = phi <2 x float> [%v, %entry], [%mul2, %loop]

  %i2 = add i32 %i, 1
  %mul2 = fmul <2 x float> %mul, %v

  %cond2 = icmp ult i32 %i2, 5
  br i1 %cond2, label %loop, label %exit

exit:
  ret <2 x float> %mul2

another_load:
  %v2 = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %p, i32 0, i32 0, i32 1, i32 0)
  %sum = fadd <2 x float> %v, %v2
  ret <2 x float> %sum
}
|
|
|
|
; Buffer-load intrinsic called by the test functions in this file; their
; expectations match the resulting instruction as buffer_load_dwordx2.
declare <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32>, i32, i32, i32, i32) nounwind
|