AMDGPU: Add mode register use to s_getreg_b32

This should fix reading a stale mode value after the mode has been set.
Ideally we would have separate pseudos for the cases that are known
not to read the MODE register.
This commit is contained in:
Matt Arsenault
2024-05-07 15:46:15 +02:00
committed by Matt Arsenault
parent 9eb91f45fb
commit f548c4d83c
3 changed files with 152 additions and 24 deletions

View File

@@ -1110,14 +1110,15 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo <
// This is hasSideEffects to allow its use in readcyclecounter selection.
// FIXME: Need to truncate immediate to 16-bits.
// FIXME: Should have separate pseudos for known may read MODE and
// only read MODE.
def S_GETREG_B32 : SOPK_Pseudo <
"s_getreg_b32",
(outs SReg_32:$sdst), (ins hwreg:$simm16),
"$sdst, $simm16",
[(set i32:$sdst, (int_amdgcn_s_getreg (i32 timm:$simm16)))]> {
let hasSideEffects = 1;
// Conservatively model every s_getreg_b32 as a reader of MODE, whichever
// hardware register $simm16 selects, so the read is not placed ahead of a
// preceding MODE write.
let Uses = [MODE];
}
let Defs = [MODE], Uses = [MODE] in {

View File

@@ -2417,12 +2417,12 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 {
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -2455,12 +2455,12 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #2 {
; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX7-NEXT: v_rcp_f32_e32 v3, v2
; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -2727,12 +2727,12 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 {
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -2765,12 +2765,12 @@ define float @v_fdiv_f32_dynamic(float %x, float %y) #2 {
; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX7-NEXT: v_rcp_f32_e32 v3, v2
; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -3294,12 +3294,12 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z)
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-FASTFMA-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-FASTFMA-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -3334,12 +3334,12 @@ define float @v_fdiv_f32_dynamic_contractable_user(float %x, float %y, float %z)
; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
; GFX7-NEXT: v_rcp_f32_e32 v4, v3
; GFX7-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX7-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX7-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX7-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX7-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX7-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -3868,12 +3868,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 {
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -3906,12 +3906,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf(float %x, float %y, float %z) #2 {
; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX7-NEXT: v_rcp_f32_e32 v3, v2
; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -4434,12 +4434,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v4, v3
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-FASTFMA-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX6-FASTFMA-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -4474,12 +4474,12 @@ define float @v_fdiv_f32_dynamic__nnan_ninf_contractable_user(float %x, float %y
; GFX7-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v0
; GFX7-NEXT: v_rcp_f32_e32 v4, v3
; GFX7-NEXT: v_div_scale_f32 v5, vcc, v0, v1, v0
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX7-NEXT: v_fma_f32 v6, -v3, v4, 1.0
; GFX7-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX7-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX7-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: v_fma_f32 v6, v7, v4, v6
; GFX7-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -5010,12 +5010,12 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 {
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, -v0, v1, -v0
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -5048,12 +5048,12 @@ define float @v_fdiv_neglhs_f32_dynamic(float %x, float %y) #2 {
; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -v0
; GFX7-NEXT: v_rcp_f32_e32 v3, v2
; GFX7-NEXT: v_div_scale_f32 v4, vcc, -v0, v1, -v0
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -5569,12 +5569,12 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 {
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, -v1, v0
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -5607,12 +5607,12 @@ define float @v_fdiv_negrhs_f32_dynamic(float %x, float %y) #2 {
; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], -v1, -v1, v0
; GFX7-NEXT: v_rcp_f32_e32 v3, v2
; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, -v1, v0
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -6113,12 +6113,12 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 {
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v2, v1
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, v0, s6, v0
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; GFX6-FASTFMA-NEXT: v_fma_f32 v2, v4, v2, v2
; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v4, v3, v2
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v2, v4
; GFX6-FASTFMA-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -6153,12 +6153,12 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #2 {
; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], s6, s6, v0
; GFX7-NEXT: v_rcp_f32_e32 v2, v1
; GFX7-NEXT: v_div_scale_f32 v3, vcc, v0, s6, v0
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2
; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2
; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4
; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -6619,12 +6619,12 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 {
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v2, v1
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v3, vcc, s6, v0, s6
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FASTFMA-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; GFX6-FASTFMA-NEXT: v_fma_f32 v2, v4, v2, v2
; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v4, v3, v2
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: v_fma_f32 v4, v5, v2, v4
; GFX6-FASTFMA-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -6659,12 +6659,12 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #2 {
; GFX7-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, s6
; GFX7-NEXT: v_rcp_f32_e32 v2, v1
; GFX7-NEXT: v_div_scale_f32 v3, vcc, s6, v0, s6
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX7-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; GFX7-NEXT: v_fma_f32 v2, v4, v2, v2
; GFX7-NEXT: v_mul_f32_e32 v4, v3, v2
; GFX7-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: v_fma_f32 v4, v5, v2, v4
; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -7168,12 +7168,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -7206,12 +7206,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_x(float nofpclass(sub) %x, float %y) #
; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX7-NEXT: v_rcp_f32_e32 v3, v2
; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -7721,12 +7721,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX6-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
@@ -7759,12 +7759,12 @@ define float @v_fdiv_f32_dynamic_nodenorm_y(float %x, float nofpclass(sub) %y) #
; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX7-NEXT: v_rcp_f32_e32 v3, v2
; GFX7-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX7-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX7-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX7-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX7-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX7-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX7-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX7-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4

View File

@@ -1661,5 +1661,132 @@ define amdgpu_gfx void @s_set_rounding_select_3_5(i32 inreg %cond) {
ret void
}
; Sets the rounding mode and then immediately queries it. The expected
; output on every target has the s_getreg_b32 of HW_REG_MODE placed after
; the instruction that writes the mode (s_setreg_imm32_b32 or s_round_mode),
; so the value read back reflects the preceding set.
define amdgpu_kernel void @get_rounding_after_set_rounding_1() {
; GFX6-LABEL: get_rounding_after_set_rounding_1:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_nop 0
; GFX6-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
; GFX6-NEXT: s_lshl_b32 s2, s0, 2
; GFX6-NEXT: s_mov_b32 s0, 0xeb24da71
; GFX6-NEXT: s_mov_b32 s1, 0xc96f385
; GFX6-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; GFX6-NEXT: s_and_b32 s0, s0, 15
; GFX6-NEXT: s_add_i32 s1, s0, 4
; GFX6-NEXT: s_cmp_lt_u32 s0, 4
; GFX6-NEXT: s_cselect_b32 s4, s0, s1
; GFX6-NEXT: s_mov_b32 s0, 0
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_mov_b32 s1, s0
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: s_endpgm
;
; GFX7-LABEL: get_rounding_after_set_rounding_1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_nop 0
; GFX7-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
; GFX7-NEXT: s_lshl_b32 s2, s0, 2
; GFX7-NEXT: s_mov_b32 s0, 0xeb24da71
; GFX7-NEXT: s_mov_b32 s1, 0xc96f385
; GFX7-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; GFX7-NEXT: s_and_b32 s0, s0, 15
; GFX7-NEXT: s_add_i32 s1, s0, 4
; GFX7-NEXT: s_cmp_lt_u32 s0, 4
; GFX7-NEXT: s_cselect_b32 s4, s0, s1
; GFX7-NEXT: s_mov_b32 s0, 0
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_mov_b32 s1, s0
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: get_rounding_after_set_rounding_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
; GFX8-NEXT: v_mov_b32_e32 v0, 0
; GFX8-NEXT: v_mov_b32_e32 v1, 0
; GFX8-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
; GFX8-NEXT: s_lshl_b32 s2, s0, 2
; GFX8-NEXT: s_mov_b32 s0, 0xeb24da71
; GFX8-NEXT: s_mov_b32 s1, 0xc96f385
; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; GFX8-NEXT: s_and_b32 s0, s0, 15
; GFX8-NEXT: s_add_i32 s1, s0, 4
; GFX8-NEXT: s_cmp_lt_u32 s0, 4
; GFX8-NEXT: s_cselect_b32 s0, s0, s1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_endpgm
;
; GFX9-LABEL: get_rounding_after_set_rounding_1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 0, 4), 0
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
; GFX9-NEXT: s_lshl_b32 s2, s0, 2
; GFX9-NEXT: s_mov_b32 s0, 0xeb24da71
; GFX9-NEXT: s_mov_b32 s1, 0xc96f385
; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; GFX9-NEXT: s_and_b32 s0, s0, 15
; GFX9-NEXT: s_add_i32 s1, s0, 4
; GFX9-NEXT: s_cmp_lt_u32 s0, 4
; GFX9-NEXT: s_cselect_b32 s0, s0, s1
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: global_store_dword v[0:1], v2, off
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: get_rounding_after_set_rounding_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_round_mode 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_lshl_b32 s2, s0, 2
; GFX10-NEXT: s_mov_b32 s0, 0xeb24da71
; GFX10-NEXT: s_mov_b32 s1, 0xc96f385
; GFX10-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; GFX10-NEXT: s_and_b32 s0, s0, 15
; GFX10-NEXT: s_add_i32 s1, s0, 4
; GFX10-NEXT: s_cmp_lt_u32 s0, 4
; GFX10-NEXT: s_cselect_b32 s0, s0, s1
; GFX10-NEXT: v_mov_b32_e32 v2, s0
; GFX10-NEXT: global_store_dword v[0:1], v2, off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: get_rounding_after_set_rounding_1:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_round_mode 0x0
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 0, 4)
; GFX11-NEXT: s_lshl_b32 s2, s0, 2
; GFX11-NEXT: s_mov_b32 s0, 0xeb24da71
; GFX11-NEXT: s_mov_b32 s1, 0xc96f385
; GFX11-NEXT: s_lshr_b64 s[0:1], s[0:1], s2
; GFX11-NEXT: s_and_b32 s0, s0, 15
; GFX11-NEXT: s_add_i32 s1, s0, 4
; GFX11-NEXT: s_cmp_lt_u32 s0, 4
; GFX11-NEXT: s_cselect_b32 s0, s0, s1
; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
; GFX11-NEXT: global_store_b32 v[0:1], v2, off dlc
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
tail call void @llvm.set.rounding(i32 1)
%set.mode = tail call i32 @llvm.get.rounding()
store volatile i32 %set.mode, ptr addrspace(1) null
ret void
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}