Files
clang-p2996/llvm/test/CodeGen/AMDGPU/bfi_int.ll
Jay Foad f2c164c815 [AMDGPU] Do not wait for vscnt on function entry and return
SIInsertWaitcnts inserts waitcnt instructions to resolve data
dependencies. The GFX10+ vscnt (VMEM store count) counter is never used
in this way. It is only used to resolve memory dependencies, and that is
handled by SIMemoryLegalizer. Hence there is no need to conservatively
wait for vscnt to be 0 on function entry and before returns.

Differential Revision: https://reviews.llvm.org/D153537
2023-07-04 12:22:38 +01:00

1766 lines
63 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
; RUN: llc -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s
; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8-GISEL %s
; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10-GISEL %s
; BFI_INT Definition pattern from ISA docs
; (y & x) | (z & ~x)
;
; Uniform (SGPR) inputs: v_bfi_b32 is a VALU instruction, so with all-scalar
; operands the pattern is expanded to s_andn2/s_and/s_or instead.
define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: s_bfi_def_i32:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: s_mov_b32 s1, s5
; GFX7-NEXT: s_andn2_b32 s4, s8, s6
; GFX7-NEXT: s_and_b32 s5, s7, s6
; GFX7-NEXT: s_or_b32 s4, s4, s5
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_bfi_def_i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s1, s7, s6
; GFX8-NEXT: s_andn2_b32 s0, s0, s6
; GFX8-NEXT: s_or_b32 s0, s0, s1
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_bfi_def_i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s1, s7, s6
; GFX10-NEXT: s_andn2_b32 s0, s0, s6
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX8-GISEL-LABEL: s_bfi_def_i32:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_and_b32 s1, s7, s6
; GFX8-GISEL-NEXT: s_andn2_b32 s0, s0, s6
; GFX8-GISEL-NEXT: s_or_b32 s0, s0, s1
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s5
; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8-GISEL-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: s_bfi_def_i32:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_and_b32 s1, s7, s6
; GFX10-GISEL-NEXT: s_andn2_b32 s0, s0, s6
; GFX10-GISEL-NEXT: s_or_b32 s0, s0, s1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
entry:
  %0 = xor i32 %x, -1
  %1 = and i32 %z, %0
  %2 = and i32 %y, %x
  %3 = or i32 %1, %2
  store i32 %3, ptr addrspace(1) %out
  ret void
}
; Divergent (VGPR) inputs: the whole (y & x) | (z & ~x) pattern folds into a
; single v_bfi_b32 on every target and for both selectors.
define i32 @v_bfi_def_i32(i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: v_bfi_def_i32:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bfi_def_i32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bfi_def_i32:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_bfi_def_i32:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_bfi_def_i32:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
  %0 = xor i32 %x, -1
  %1 = and i32 %z, %0
  %2 = and i32 %y, %x
  %3 = or i32 %1, %2
  ret i32 %3
}
; SHA-256 Ch function
; z ^ (x & (y ^ z))
; Uniform inputs: the Ch pattern cannot use the VALU-only v_bfi_b32, so it is
; expanded to scalar s_xor/s_and/s_xor.
define amdgpu_kernel void @s_bfi_sha256_ch(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: s_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: s_xor_b32 s4, s7, s8
; GFX7-NEXT: s_and_b32 s4, s6, s4
; GFX7-NEXT: s_xor_b32 s4, s8, s4
; GFX7-NEXT: s_mov_b32 s1, s5
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_xor_b32 s1, s7, s0
; GFX8-NEXT: s_and_b32 s1, s6, s1
; GFX8-NEXT: s_xor_b32 s0, s0, s1
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_xor_b32 s1, s7, s0
; GFX10-NEXT: s_and_b32 s1, s6, s1
; GFX10-NEXT: s_xor_b32 s0, s0, s1
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX8-GISEL-LABEL: s_bfi_sha256_ch:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX8-GISEL-NEXT: s_xor_b32 s1, s7, s0
; GFX8-GISEL-NEXT: s_and_b32 s1, s6, s1
; GFX8-GISEL-NEXT: s_xor_b32 s0, s0, s1
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s5
; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8-GISEL-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: s_bfi_sha256_ch:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_xor_b32 s1, s7, s0
; GFX10-GISEL-NEXT: s_and_b32 s1, s6, s1
; GFX10-GISEL-NEXT: s_xor_b32 s0, s0, s1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
entry:
  %0 = xor i32 %y, %z
  %1 = and i32 %x, %0
  %2 = xor i32 %z, %1
  store i32 %2, ptr addrspace(1) %out
  ret void
}
; Divergent inputs: Ch = z ^ (x & (y ^ z)) folds into a single v_bfi_b32.
define i32 @v_bfi_sha256_ch(i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: v_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_bfi_sha256_ch:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_bfi_sha256_ch:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v1, v2
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
  %0 = xor i32 %y, %z
  %1 = and i32 %x, %0
  %2 = xor i32 %z, %1
  ret i32 %2
}
; Mixed operand classes: x divergent, y/z uniform. Pre-GFX10 needs a v_mov to
; place the second BFI source; GFX10 can encode the SGPR operand directly.
define amdgpu_ps float @v_s_s_bfi_sha256_ch(i32 %x, i32 inreg %y, i32 inreg %z) {
; GFX7-LABEL: v_s_s_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: v_bfi_b32 v0, v0, v1, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_s_s_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_bfi_b32 v0, v0, v1, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_s_s_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_bfi_b32 v0, v0, s0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: v_s_s_bfi_sha256_ch:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v1, s1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: v_s_s_bfi_sha256_ch:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, s0, s1
; GFX10-GISEL-NEXT: ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}
; Mixed operand classes: x/z uniform, y divergent.
define amdgpu_ps float @s_v_s_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 inreg %z) {
; GFX7-LABEL: s_v_s_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: v_bfi_b32 v0, v1, v0, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_v_s_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_bfi_b32 v0, v1, v0, s1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_v_s_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_bfi_b32 v0, s0, v0, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_v_s_bfi_sha256_ch:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX8-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: s_v_s_bfi_sha256_ch:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, v0, s1
; GFX10-GISEL-NEXT: ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}
; Mixed operand classes: x/y uniform, z divergent.
define amdgpu_ps float @s_s_v_bfi_sha256_ch(i32 inreg %x, i32 inreg %y, i32 %z) {
; GFX7-LABEL: s_s_v_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_mov_b32_e32 v1, s0
; GFX7-NEXT: v_bfi_b32 v0, v1, s1, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_bfi_b32 v0, v1, s1, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_bfi_b32 v0, s0, s1, v0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ch:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s0
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v1, s1, v0
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: s_s_v_bfi_sha256_ch:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, s1, v0
; GFX10-GISEL-NEXT: ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}
; Only x uniform: v_bfi_b32 can take the single SGPR as its first source
; directly on all targets, so no v_mov is needed.
define amdgpu_ps float @s_v_v_bfi_sha256_ch(i32 inreg %x, i32 %y, i32 %z) {
; GFX7-LABEL: s_v_v_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_v_v_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_v_v_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_v_v_bfi_sha256_ch:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: s_v_v_bfi_sha256_ch:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX10-GISEL-NEXT: ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}
; Only y uniform: single SGPR source encoded directly into v_bfi_b32.
define amdgpu_ps float @v_s_v_bfi_sha256_ch(i32 %x, i32 inreg %y, i32 %z) {
; GFX7-LABEL: v_s_v_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_bfi_b32 v0, v0, s0, v1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_s_v_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_bfi_b32 v0, v0, s0, v1
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_s_v_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_bfi_b32 v0, v0, s0, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: v_s_v_bfi_sha256_ch:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, s0, v1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: v_s_v_bfi_sha256_ch:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, s0, v1
; GFX10-GISEL-NEXT: ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}
; Only z uniform: single SGPR source encoded directly into v_bfi_b32.
define amdgpu_ps float @v_v_s_bfi_sha256_ch(i32 %x, i32 %y, i32 inreg %z) {
; GFX7-LABEL: v_v_s_bfi_sha256_ch:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_bfi_b32 v0, v0, v1, s0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_v_s_bfi_sha256_ch:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_bfi_b32 v0, v0, v1, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_v_s_bfi_sha256_ch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_bfi_b32 v0, v0, v1, s0
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: v_v_s_bfi_sha256_ch:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v1, s0
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: v_v_s_bfi_sha256_ch:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v1, s0
; GFX10-GISEL-NEXT: ; return to shader part epilog
entry:
  %xor0 = xor i32 %y, %z
  %and = and i32 %x, %xor0
  %xor1 = xor i32 %z, %and
  %cast = bitcast i32 %xor1 to float
  ret float %cast
}
; SHA-256 Ma function
; ((x & z) | (y & (x | z)))
; Uniform inputs: the Ma pattern is expanded to scalar s_and/s_or ops since
; v_bfi_b32 requires VALU operands.
define amdgpu_kernel void @s_bfi_sha256_ma(ptr addrspace(1) %out, i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: s_bfi_sha256_ma:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dword s8, s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s1, s5
; GFX7-NEXT: s_or_b32 s5, s6, s8
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: s_and_b32 s4, s6, s8
; GFX7-NEXT: s_and_b32 s5, s7, s5
; GFX7-NEXT: s_or_b32 s4, s4, s5
; GFX7-NEXT: v_mov_b32_e32 v0, s4
; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_bfi_sha256_ma:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: s_and_b32 s1, s6, s0
; GFX8-NEXT: s_or_b32 s0, s6, s0
; GFX8-NEXT: s_and_b32 s0, s7, s0
; GFX8-NEXT: s_or_b32 s0, s1, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: flat_store_dword v[0:1], v2
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_bfi_sha256_ma:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_or_b32 s1, s6, s0
; GFX10-NEXT: s_and_b32 s0, s6, s0
; GFX10-NEXT: s_and_b32 s1, s7, s1
; GFX10-NEXT: s_or_b32 s0, s0, s1
; GFX10-NEXT: v_mov_b32_e32 v1, s0
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX8-GISEL-LABEL: s_bfi_sha256_ma:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX8-GISEL-NEXT: s_and_b32 s1, s6, s0
; GFX8-GISEL-NEXT: s_or_b32 s0, s6, s0
; GFX8-GISEL-NEXT: s_and_b32 s0, s7, s0
; GFX8-GISEL-NEXT: s_or_b32 s0, s1, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s5
; GFX8-GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX8-GISEL-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: s_bfi_sha256_ma:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_load_dword s0, s[0:1], 0x34
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_or_b32 s1, s6, s0
; GFX10-GISEL-NEXT: s_and_b32 s0, s6, s0
; GFX10-GISEL-NEXT: s_and_b32 s1, s7, s1
; GFX10-GISEL-NEXT: s_or_b32 s0, s0, s1
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: global_store_dword v1, v0, s[4:5]
; GFX10-GISEL-NEXT: s_endpgm
entry:
  %0 = and i32 %x, %z
  %1 = or i32 %x, %z
  %2 = and i32 %y, %1
  %3 = or i32 %0, %2
  store i32 %3, ptr addrspace(1) %out
  ret void
}
; Divergent inputs: Ma = (x & z) | (y & (x | z)) selects to v_xor + v_bfi_b32,
; using the identity Ma(x,y,z) = bfi(x ^ y, z, y).
define i32 @v_bfi_sha256_ma(i32 %x, i32 %y, i32 %z) {
; GFX7-LABEL: v_bfi_sha256_ma:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX7-NEXT: v_bfi_b32 v0, v0, v2, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bfi_sha256_ma:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX8-NEXT: v_bfi_b32 v0, v0, v2, v1
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bfi_sha256_ma:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX10-NEXT: v_bfi_b32 v0, v0, v2, v1
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_bfi_sha256_ma:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v2, v1
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_bfi_sha256_ma:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v2, v1
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
  %0 = and i32 %x, %z
  %1 = or i32 %x, %z
  %2 = and i32 %y, %1
  %3 = or i32 %0, %2
  ret i32 %3
}
; Vector form of bitselect pattern 1: ((a ^ mask) & b) ^ mask.
; The <2 x i32> op is scalarized to one v_bfi_b32 per element.
define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %mask) {
; GFX7-LABEL: v_bitselect_v2i32_pat1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX7-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bitselect_v2i32_pat1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX8-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bitselect_v2i32_pat1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX10-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_bitselect_v2i32_pat1:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX8-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_bitselect_v2i32_pat1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX10-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
  %xor.0 = xor <2 x i32> %a, %mask
  %and = and <2 x i32> %xor.0, %b
  %bitselect = xor <2 x i32> %and, %mask
  ret <2 x i32> %bitselect
}
; 64-bit bitselect pattern 0: (a & b) | (~a & mask).
; Split into two 32-bit v_bfi_b32, one per half of the i64.
define i64 @v_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX7-LABEL: v_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfi_b32 v1, v1, v3, v5
; GFX7-NEXT: v_bfi_b32 v0, v0, v2, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_bfi_b32 v1, v1, v3, v5
; GFX8-NEXT: v_bfi_b32 v0, v0, v2, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfi_b32 v0, v0, v2, v4
; GFX10-NEXT: v_bfi_b32 v1, v1, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_bitselect_i64_pat_0:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v2, v4
; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, v3, v5
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_bitselect_i64_pat_0:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v2, v4
; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, v3, v5
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  ret i64 %bitselect
}
; i64 pattern 0 with divergent select mask a, uniform b/mask. Pre-GFX10 needs
; v_movs for the SGPR sources; GFX10 folds both SGPRs into the v_bfi_b32.
define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_0(i64 %a, i64 inreg %b, i64 inreg %mask) {
; GFX7-LABEL: v_s_s_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: v_bfi_b32 v1, v1, s1, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_s_s_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_bfi_b32 v1, v1, s1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_s_s_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, v0, s0, s2
; GFX10-NEXT: v_bfi_b32 v1, v1, s1, s3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: v_s_s_bitselect_i64_pat_0:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v2, s2
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s1
; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, v2, s3
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: v_s_s_bitselect_i64_pat_0:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, s0, s2
; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, s1, s3
; GFX10-GISEL-NEXT: ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}
; i64 pattern 0 with uniform a/mask, divergent b. GlobalISel does not form
; BFI here: it expands to and/andn2/or instead.
define amdgpu_ps <2 x float> @s_v_s_bitselect_i64_pat_0(i64 inreg %a, i64 %b, i64 inreg %mask) {
; GFX7-LABEL: s_v_s_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: v_bfi_b32 v1, s1, v1, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_v_s_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_bfi_b32 v1, s1, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_v_s_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, s0, v0, s2
; GFX10-NEXT: v_bfi_b32 v1, s1, v1, s3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_v_s_bitselect_i64_pat_0:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
; GFX8-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
; GFX8-GISEL-NEXT: s_andn2_b64 s[0:1], s[2:3], s[0:1]
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, s0, v0
; GFX8-GISEL-NEXT: v_or_b32_e32 v1, s1, v1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: s_v_s_bitselect_i64_pat_0:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1]
; GFX10-GISEL-NEXT: v_and_or_b32 v0, s0, v0, s2
; GFX10-GISEL-NEXT: v_and_or_b32 v1, s1, v1, s3
; GFX10-GISEL-NEXT: ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}
; i64 pattern 0 with uniform a/b, divergent mask. SelectionDAG still forms two
; BFIs; GlobalISel keeps the scalar and/not and does the rest on the VALU.
define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_0(i64 inreg %a, i64 inreg %b, i64 %mask) {
; GFX7-LABEL: s_s_v_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: v_bfi_b32 v1, s1, v2, v1
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_bfi_b32 v0, s0, v2, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_bfi_b32 v1, s1, v2, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_bfi_b32 v0, s0, v2, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, s0, s2, v0
; GFX10-NEXT: v_bfi_b32 v1, s1, s3, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_0:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: s_not_b64 s[0:1], s[0:1]
; GFX8-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
; GFX8-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, s2, v0
; GFX8-GISEL-NEXT: v_or_b32_e32 v1, s3, v1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: s_s_v_bitselect_i64_pat_0:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3]
; GFX10-GISEL-NEXT: s_not_b64 s[0:1], s[0:1]
; GFX10-GISEL-NEXT: v_and_or_b32 v0, s0, v0, s2
; GFX10-GISEL-NEXT: v_and_or_b32 v1, s1, v1, s3
; GFX10-GISEL-NEXT: ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}
; i64 pattern 0 with divergent a/b, uniform mask: two v_bfi_b32 with the SGPR
; mask halves encoded directly.
define amdgpu_ps <2 x float> @v_v_s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 inreg %mask) {
; GFX7-LABEL: v_v_s_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX7-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_v_s_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX8-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_v_s_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX10-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: v_v_s_bitselect_i64_pat_0:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: v_v_s_bitselect_i64_pat_0:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX10-GISEL-NEXT: ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}
; i64 pattern 0 with divergent a/mask, uniform b: SGPR b halves fold straight
; into the two v_bfi_b32 on all targets.
define amdgpu_ps <2 x float> @v_s_v_bitselect_i64_pat_0(i64 %a, i64 inreg %b, i64 %mask) {
; GFX7-LABEL: v_s_v_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_bfi_b32 v1, v1, s1, v3
; GFX7-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_s_v_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_bfi_b32 v1, v1, s1, v3
; GFX8-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_s_v_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX10-NEXT: v_bfi_b32 v1, v1, s1, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: v_s_v_bitselect_i64_pat_0:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, s1, v3
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: v_s_v_bitselect_i64_pat_0:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, s1, v3
; GFX10-GISEL-NEXT: ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}
; i64 pattern 0 with uniform a, divergent b/mask. SelectionDAG forms two BFIs;
; GlobalISel expands to and/not/and/or (using v_and_or_b32 on GFX10).
define amdgpu_ps <2 x float> @s_v_v_bitselect_i64_pat_0(i64 inreg %a, i64 %b, i64 %mask) {
; GFX7-LABEL: s_v_v_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_bfi_b32 v1, s1, v1, v3
; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_v_v_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_bfi_b32 v1, s1, v1, v3
; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_v_v_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, s0, v0, v2
; GFX10-NEXT: v_bfi_b32 v1, s1, v1, v3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_v_v_bitselect_i64_pat_0:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
; GFX8-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
; GFX8-GISEL-NEXT: s_not_b64 s[0:1], s[0:1]
; GFX8-GISEL-NEXT: v_and_b32_e32 v2, s0, v2
; GFX8-GISEL-NEXT: v_and_b32_e32 v3, s1, v3
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, v0, v2
; GFX8-GISEL-NEXT: v_or_b32_e32 v1, v1, v3
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: s_v_v_bitselect_i64_pat_0:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_not_b64 s[2:3], s[0:1]
; GFX10-GISEL-NEXT: v_and_b32_e32 v2, s2, v2
; GFX10-GISEL-NEXT: v_and_b32_e32 v3, s3, v3
; GFX10-GISEL-NEXT: v_and_or_b32 v0, s0, v0, v2
; GFX10-GISEL-NEXT: v_and_or_b32 v1, s1, v1, v3
; GFX10-GISEL-NEXT: ; return to shader part epilog
  %and0 = and i64 %a, %b
  %not.a = xor i64 %a, -1
  %and1 = and i64 %not.a, %mask
  %bitselect = or i64 %and0, %and1
  %cast = bitcast i64 %bitselect to <2 x float>
  ret <2 x float> %cast
}
; 64-bit bitselect pattern 1: ((a ^ mask) & b) ^ mask.
; Equivalent to pattern 0 with b as the select; two v_bfi_b32.
define i64 @v_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX7-LABEL: v_bitselect_i64_pat_1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bitselect_i64_pat_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bitselect_i64_pat_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX10-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_bitselect_i64_pat_1:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX8-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_bitselect_i64_pat_1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX10-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
  %xor.0 = xor i64 %a, %mask
  %and = and i64 %xor.0, %b
  %bitselect = xor i64 %and, %mask
  ret i64 %bitselect
}
define amdgpu_ps <2 x float> @v_s_s_bitselect_i64_pat_1(i64 %a, i64 inreg %b, i64 inreg %mask) {
; GFX7-LABEL: v_s_s_bitselect_i64_pat_1:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: v_bfi_b32 v1, s1, v1, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_s_s_bitselect_i64_pat_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_bfi_b32 v1, s1, v1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_bfi_b32 v0, s0, v0, v2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_s_s_bitselect_i64_pat_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, s0, v0, s2
; GFX10-NEXT: v_bfi_b32 v1, s1, v1, s3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: v_s_s_bitselect_i64_pat_1:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, v0, s2
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s1
; GFX8-GISEL-NEXT: v_bfi_b32 v1, v2, v1, s3
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: v_s_s_bitselect_i64_pat_1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: v_bfi_b32 v0, s0, v0, s3
; GFX10-GISEL-NEXT: v_bfi_b32 v1, s1, v1, s3
; GFX10-GISEL-NEXT: ; return to shader part epilog
; Same ((a ^ mask) & b) ^ mask bitselect, with %a in VGPRs and %b/%mask in
; SGPRs. GFX7/8 first copy one scalar source into a VGPR before v_bfi_b32;
; GFX10 folds both SGPR operands into the instruction directly.
%xor.0 = xor i64 %a, %mask
%and = and i64 %xor.0, %b
%bitselect = xor i64 %and, %mask
%cast = bitcast i64 %bitselect to <2 x float>
ret <2 x float> %cast
}
define amdgpu_ps <2 x float> @s_s_v_bitselect_i64_pat_1(i64 inreg %a, i64 inreg %b, i64 %mask) {
; GFX7-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: v_bfi_b32 v1, s3, v2, v1
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: v_bfi_b32 v0, s2, v2, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_bfi_b32 v1, s3, v2, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_bfi_b32 v0, s2, v2, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, s2, s0, v0
; GFX10-NEXT: v_bfi_b32 v1, s3, s1, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, s0, v0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX8-GISEL-NEXT: v_bfi_b32 v1, v2, s1, v1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: s_s_v_bitselect_i64_pat_1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: v_bfi_b32 v0, s2, s0, v0
; GFX10-GISEL-NEXT: v_bfi_b32 v1, s3, s1, v1
; GFX10-GISEL-NEXT: ; return to shader part epilog
; ((a ^ mask) & b) ^ mask with scalar %a/%b and a VGPR %mask. Still one
; v_bfi_b32 per half; pre-GFX10 a scalar source is copied to a VGPR first.
%xor.0 = xor i64 %a, %mask
%and = and i64 %xor.0, %b
%bitselect = xor i64 %and, %mask
%cast = bitcast i64 %bitselect to <2 x float>
ret <2 x float> %cast
}
define amdgpu_ps <2 x float> @s_v_s_bitselect_i64_pat_1(i64 inreg %a, i64 %b, i64 inreg %mask) {
; GFX7-LABEL: s_v_s_bitselect_i64_pat_1:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: v_bfi_b32 v1, v1, s1, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_v_s_bitselect_i64_pat_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_bfi_b32 v1, v1, s1, v2
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_bfi_b32 v0, v0, s0, v2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_v_s_bitselect_i64_pat_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: v_bfi_b32 v0, v0, s0, s2
; GFX10-NEXT: v_bfi_b32 v1, v1, s1, s3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_v_s_bitselect_i64_pat_1:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
; GFX8-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, s3, v1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: s_v_s_bitselect_i64_pat_1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3]
; GFX10-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
; GFX10-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, s2, v0
; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, s3, v1
; GFX10-GISEL-NEXT: ; return to shader part epilog
; Scalar %a/%mask with VGPR %b. SDAG still forms v_bfi_b32 per half, while
; GlobalISel keeps the leading xor scalar (s_xor_b64) and finishes with
; vector and/xor instead of forming bfi.
%xor.0 = xor i64 %a, %mask
%and = and i64 %xor.0, %b
%bitselect = xor i64 %and, %mask
%cast = bitcast i64 %bitselect to <2 x float>
ret <2 x float> %cast
}
define i64 @v_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX7-LABEL: v_bitselect_i64_pat_2:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bitselect_i64_pat_2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bitselect_i64_pat_2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX10-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_bitselect_i64_pat_2:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX8-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_bitselect_i64_pat_2:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v2, v0, v4
; GFX10-GISEL-NEXT: v_bfi_b32 v1, v3, v1, v5
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
; NOTE: the IR here is the same ((a ^ mask) & b) ^ mask expression as
; v_bitselect_i64_pat_1; codegen is likewise one v_bfi_b32 per half.
%xor.0 = xor i64 %a, %mask
%and = and i64 %xor.0, %b
%bitselect = xor i64 %and, %mask
ret i64 %bitselect
}
define i64 @v_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX7-LABEL: v_bfi_sha256_ma_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX7-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX7-NEXT: v_bfi_b32 v1, v1, v5, v3
; GFX7-NEXT: v_bfi_b32 v0, v0, v4, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_bfi_sha256_ma_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX8-NEXT: v_bfi_b32 v1, v1, v5, v3
; GFX8-NEXT: v_bfi_b32 v0, v0, v4, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_bfi_sha256_ma_i64:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX10-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX10-NEXT: v_bfi_b32 v0, v0, v4, v2
; GFX10-NEXT: v_bfi_b32 v1, v1, v5, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-GISEL-LABEL: v_bfi_sha256_ma_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v4, v2
; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, v5, v3
; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: v_bfi_sha256_ma_i64:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v4, v2
; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, v5, v3
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
; SHA-256 "Ma" (majority) expression on i64 with all-VGPR operands:
; (x & z) | (y & (x | z)). Each 32-bit half lowers to v_xor + v_bfi_b32.
entry:
%and0 = and i64 %x, %z
%or0 = or i64 %x, %z
%and1 = and i64 %y, %or0
%or1 = or i64 %and0, %and1
ret i64 %or1
}
define amdgpu_ps <2 x float> @v_s_s_bfi_sha256_ma_i64(i64 %x, i64 inreg %y, i64 inreg %z) {
; GFX7-LABEL: v_s_s_bfi_sha256_ma_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX7-NEXT: v_mov_b32_e32 v2, s1
; GFX7-NEXT: v_bfi_b32 v1, v1, s3, v2
; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: v_bfi_b32 v0, v0, s2, v2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_s_s_bfi_sha256_ma_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX8-NEXT: v_mov_b32_e32 v2, s1
; GFX8-NEXT: v_bfi_b32 v1, v1, s3, v2
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_bfi_b32 v0, v0, s2, v2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_s_s_bfi_sha256_ma_i64:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX10-NEXT: v_bfi_b32 v0, v0, s2, s0
; GFX10-NEXT: v_bfi_b32 v1, v1, s3, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: v_s_s_bfi_sha256_ma_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, v2, s1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: v_s_s_bfi_sha256_ma_i64:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, s2, s0
; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, s3, s1
; GFX10-GISEL-NEXT: ; return to shader part epilog
; SHA-256 Ma with VGPR %x and SGPR %y/%z: still xor + v_bfi_b32 per half;
; GFX7/8 need an extra v_mov to place a second scalar source in a VGPR.
entry:
%and0 = and i64 %x, %z
%or0 = or i64 %x, %z
%and1 = and i64 %y, %or0
%or1 = or i64 %and0, %and1
%cast = bitcast i64 %or1 to <2 x float>
ret <2 x float> %cast
}
define amdgpu_ps <2 x float> @s_v_s_bfi_sha256_ma_i64(i64 inreg %x, i64 %y, i64 inreg %z) {
; GFX7-LABEL: s_v_s_bfi_sha256_ma_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_xor_b32_e32 v2, s1, v1
; GFX7-NEXT: v_bfi_b32 v1, v2, s3, v1
; GFX7-NEXT: v_xor_b32_e32 v2, s0, v0
; GFX7-NEXT: v_bfi_b32 v0, v2, s2, v0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_v_s_bfi_sha256_ma_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_xor_b32_e32 v2, s1, v1
; GFX8-NEXT: v_bfi_b32 v1, v2, s3, v1
; GFX8-NEXT: v_xor_b32_e32 v2, s0, v0
; GFX8-NEXT: v_bfi_b32 v0, v2, s2, v0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_v_s_bfi_sha256_ma_i64:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_xor_b32_e32 v2, s0, v0
; GFX10-NEXT: v_xor_b32_e32 v3, s1, v1
; GFX10-NEXT: v_bfi_b32 v0, v2, s2, v0
; GFX10-NEXT: v_bfi_b32 v1, v3, s3, v1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_v_s_bfi_sha256_ma_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_and_b64 s[4:5], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX8-GISEL-NEXT: v_and_b32_e32 v0, s0, v0
; GFX8-GISEL-NEXT: v_and_b32_e32 v1, s1, v1
; GFX8-GISEL-NEXT: v_or_b32_e32 v0, s4, v0
; GFX8-GISEL-NEXT: v_or_b32_e32 v1, s5, v1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: s_v_s_bfi_sha256_ma_i64:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_and_b64 s[4:5], s[0:1], s[2:3]
; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-GISEL-NEXT: v_and_or_b32 v0, v0, s0, s4
; GFX10-GISEL-NEXT: v_and_or_b32 v1, v1, s1, s5
; GFX10-GISEL-NEXT: ; return to shader part epilog
; SHA-256 Ma with scalar %x/%z and VGPR %y. SDAG forms xor + v_bfi_b32;
; GlobalISel keeps x&z and x|z on the scalar unit (s_and/s_or) and does
; not form bfi (GFX10 GISEL fuses the tail into v_and_or_b32).
entry:
%and0 = and i64 %x, %z
%or0 = or i64 %x, %z
%and1 = and i64 %y, %or0
%or1 = or i64 %and0, %and1
%cast = bitcast i64 %or1 to <2 x float>
ret <2 x float> %cast
}
define amdgpu_ps <2 x float> @s_s_v_bfi_sha256_ma_i64(i64 inreg %x, i64 inreg %y, i64 %z) {
; GFX7-LABEL: s_s_v_bfi_sha256_ma_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_mov_b32_e32 v2, s3
; GFX7-NEXT: v_xor_b32_e32 v2, s1, v2
; GFX7-NEXT: v_bfi_b32 v1, v2, v1, s3
; GFX7-NEXT: v_mov_b32_e32 v2, s2
; GFX7-NEXT: v_xor_b32_e32 v2, s0, v2
; GFX7-NEXT: v_bfi_b32 v0, v2, v0, s2
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_s_v_bfi_sha256_ma_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_mov_b32_e32 v2, s3
; GFX8-NEXT: v_xor_b32_e32 v2, s1, v2
; GFX8-NEXT: v_bfi_b32 v1, v2, v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s2
; GFX8-NEXT: v_xor_b32_e32 v2, s0, v2
; GFX8-NEXT: v_bfi_b32 v0, v2, v0, s2
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_s_v_bfi_sha256_ma_i64:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_xor_b32_e64 v2, s0, s2
; GFX10-NEXT: v_xor_b32_e64 v3, s1, s3
; GFX10-NEXT: v_bfi_b32 v0, v2, v0, s2
; GFX10-NEXT: v_bfi_b32 v1, v3, v1, s3
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: s_s_v_bfi_sha256_ma_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX8-GISEL-NEXT: v_xor_b32_e32 v2, s2, v2
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v2, v0, s2
; GFX8-GISEL-NEXT: v_mov_b32_e32 v2, s1
; GFX8-GISEL-NEXT: v_xor_b32_e32 v2, s3, v2
; GFX8-GISEL-NEXT: v_bfi_b32 v1, v2, v1, s3
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: s_s_v_bfi_sha256_ma_i64:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: v_xor_b32_e64 v2, s0, s2
; GFX10-GISEL-NEXT: v_xor_b32_e64 v3, s1, s3
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v2, v0, s2
; GFX10-GISEL-NEXT: v_bfi_b32 v1, v3, v1, s3
; GFX10-GISEL-NEXT: ; return to shader part epilog
; SHA-256 Ma with scalar %x/%y and VGPR %z. All configurations form
; xor + v_bfi_b32 per half; pre-GFX10 the xor of two scalar sources needs
; one of them copied into a VGPR first.
entry:
%and0 = and i64 %x, %z
%or0 = or i64 %x, %z
%and1 = and i64 %y, %or0
%or1 = or i64 %and0, %and1
%cast = bitcast i64 %or1 to <2 x float>
ret <2 x float> %cast
}
define amdgpu_ps <2 x float> @v_s_v_bfi_sha256_ma_i64(i64 %x, i64 inreg %y, i64 %z) {
; GFX7-LABEL: v_s_v_bfi_sha256_ma_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX7-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX7-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX7-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: v_s_v_bfi_sha256_ma_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX8-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX8-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX8-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX8-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: v_s_v_bfi_sha256_ma_i64:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX10-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX10-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX10-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX10-NEXT: ; return to shader part epilog
;
; GFX8-GISEL-LABEL: v_s_v_bfi_sha256_ma_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX8-GISEL-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX8-GISEL-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX8-GISEL-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX8-GISEL-NEXT: ; return to shader part epilog
;
; GFX10-GISEL-LABEL: v_s_v_bfi_sha256_ma_i64:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, s0, v0
; GFX10-GISEL-NEXT: v_xor_b32_e32 v1, s1, v1
; GFX10-GISEL-NEXT: v_bfi_b32 v0, v0, v2, s0
; GFX10-GISEL-NEXT: v_bfi_b32 v1, v1, v3, s1
; GFX10-GISEL-NEXT: ; return to shader part epilog
; SHA-256 Ma with VGPR %x/%z and only %y scalar: identical xor + v_bfi_b32
; lowering on every configuration, with no extra register copies.
entry:
%and0 = and i64 %x, %z
%or0 = or i64 %x, %z
%and1 = and i64 %y, %or0
%or1 = or i64 %and0, %and1
%cast = bitcast i64 %or1 to <2 x float>
ret <2 x float> %cast
}
define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; GFX7-LABEL: s_bitselect_i64_pat_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7]
; GFX7-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX7-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1]
; GFX7-NEXT: s_add_u32 s0, s0, 10
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_bitselect_i64_pat_0:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
; GFX8-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_bitselect_i64_pat_0:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
; GFX10-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_endpgm
;
; GFX8-GISEL-LABEL: s_bitselect_i64_pat_0:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
; GFX8-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX8-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX8-GISEL-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: s_bitselect_i64_pat_0:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[6:7]
; GFX10-GISEL-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-GISEL-NEXT: s_endpgm
; All-scalar bitselect pattern 0: (a & b) | (~a & mask). The add of 10
; (%scalar.use) keeps the value in SGPRs, so it lowers to scalar
; s_and_b64/s_andn2_b64/s_or_b64 rather than a per-half v_bfi_b32.
%and0 = and i64 %a, %b
%not.a = xor i64 %a, -1
%and1 = and i64 %not.a, %mask
%bitselect = or i64 %and0, %and1
%scalar.use = add i64 %bitselect, 10
store i64 %scalar.use, ptr addrspace(1) undef
ret void
}
define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GFX7-LABEL: s_bitselect_i64_pat_1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1]
; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GFX7-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
; GFX7-NEXT: s_add_u32 s0, s0, 10
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_bitselect_i64_pat_1:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_bitselect_i64_pat_1:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
; GFX10-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_endpgm
;
; GFX8-GISEL-LABEL: s_bitselect_i64_pat_1:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX8-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX8-GISEL-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: s_bitselect_i64_pat_1:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-GISEL-NEXT: s_endpgm
; All-scalar bitselect pattern 1: ((a ^ mask) & b) ^ mask, kept on the
; scalar unit (s_xor/s_and/s_xor) because of the scalar add + store use.
%xor.0 = xor i64 %a, %mask
%and = and i64 %xor.0, %b
%bitselect = xor i64 %and, %mask
%scalar.use = add i64 %bitselect, 10
store i64 %scalar.use, ptr addrspace(1) undef
ret void
}
define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
; GFX7-LABEL: s_bitselect_i64_pat_2:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_xor_b64 s[4:5], s[4:5], s[0:1]
; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
; GFX7-NEXT: s_xor_b64 s[0:1], s[4:5], s[0:1]
; GFX7-NEXT: s_add_u32 s0, s0, 10
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_bitselect_i64_pat_2:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
; GFX8-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
; GFX8-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_bitselect_i64_pat_2:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
; GFX10-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
; GFX10-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_endpgm
;
; GFX8-GISEL-LABEL: s_bitselect_i64_pat_2:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
; GFX8-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX8-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX8-GISEL-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: s_bitselect_i64_pat_2:
; GFX10-GISEL: ; %bb.0:
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_xor_b64 s[2:3], s[4:5], s[0:1]
; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[2:3], s[6:7]
; GFX10-GISEL-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-GISEL-NEXT: s_endpgm
; All-scalar pattern 2: same ((a ^ mask) & b) ^ mask IR as pattern 1, and
; identical scalar s_xor/s_and/s_xor codegen on every configuration.
%xor.0 = xor i64 %a, %mask
%and = and i64 %xor.0, %b
%bitselect = xor i64 %and, %mask
%scalar.use = add i64 %bitselect, 10
store i64 %scalar.use, ptr addrspace(1) undef
ret void
}
define amdgpu_kernel void @s_bfi_sha256_ma_i64(i64 %x, i64 %y, i64 %z) {
; GFX7-LABEL: s_bfi_sha256_ma_i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; GFX7-NEXT: s_mov_b32 s3, 0xf000
; GFX7-NEXT: s_mov_b32 s2, -1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_and_b64 s[8:9], s[4:5], s[0:1]
; GFX7-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
; GFX7-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
; GFX7-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1]
; GFX7-NEXT: s_add_u32 s0, s0, 10
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
; GFX8-LABEL: s_bfi_sha256_ma_i64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1]
; GFX8-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
; GFX8-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX8-NEXT: s_add_u32 s0, s0, 10
; GFX8-NEXT: s_addc_u32 s1, s1, 0
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX8-NEXT: s_endpgm
;
; GFX10-LABEL: s_bfi_sha256_ma_i64:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1]
; GFX10-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
; GFX10-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
; GFX10-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-NEXT: s_add_u32 s0, s0, 10
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-NEXT: s_endpgm
;
; GFX8-GISEL-LABEL: s_bfi_sha256_ma_i64:
; GFX8-GISEL: ; %bb.0: ; %entry
; GFX8-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX8-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX8-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-GISEL-NEXT: s_and_b64 s[2:3], s[4:5], s[0:1]
; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1]
; GFX8-GISEL-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1]
; GFX8-GISEL-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1]
; GFX8-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX8-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX8-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX8-GISEL-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
; GFX8-GISEL-NEXT: s_endpgm
;
; GFX10-GISEL-LABEL: s_bfi_sha256_ma_i64:
; GFX10-GISEL: ; %bb.0: ; %entry
; GFX10-GISEL-NEXT: s_clause 0x1
; GFX10-GISEL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-GISEL-NEXT: s_or_b64 s[2:3], s[4:5], s[0:1]
; GFX10-GISEL-NEXT: s_and_b64 s[0:1], s[4:5], s[0:1]
; GFX10-GISEL-NEXT: s_and_b64 s[2:3], s[6:7], s[2:3]
; GFX10-GISEL-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3]
; GFX10-GISEL-NEXT: s_add_u32 s0, s0, 10
; GFX10-GISEL-NEXT: s_addc_u32 s1, s1, 0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX10-GISEL-NEXT: global_store_dwordx2 v[0:1], v[0:1], off
; GFX10-GISEL-NEXT: s_endpgm
; All-scalar SHA-256 Ma: (x & z) | (y & (x | z)) stays on the scalar unit
; (s_and/s_or pairs) since the result feeds the scalar add + store.
entry:
%and0 = and i64 %x, %z
%or0 = or i64 %x, %z
%and1 = and i64 %y, %or0
%or1 = or i64 %and0, %and1
%scalar.use = add i64 %or1, 10
store i64 %scalar.use, ptr addrspace(1) undef
ret void
}