Files
clang-p2996/llvm/test/CodeGen/AMDGPU/usubo.ll
Christudasan Devadasan 229e118559 [AMDGPU] Codegen support for constrained multi-dword sloads (#96163)
For targets that support xnack replay feature (gfx8+), the
multi-dword scalar loads shouldn't clobber any register that
holds the src address. The constrained version of the scalar
loads have the early clobber flag attached to the dst operand
to restrict RA from re-allocating any of the src regs for its
dst operand.
2024-07-23 13:59:15 +05:30

820 lines
30 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) #0 {
; SI-LABEL: s_usubo_i64_zext:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_sub_u32 s4, s6, s8
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: v_mov_b32_e32 v1, s7
; SI-NEXT: s_subb_u32 s5, s7, s9
; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_i64_zext:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: s_sub_u32 s0, s6, s0
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: s_subb_u32 s1, s7, s1
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_usubo_i64_zext:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_sub_u32 s0, s6, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_subb_u32 s1, s7, s1
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
%ext = zext i1 %carry to i64
%add2 = add i64 %val, %ext
store i64 %add2, ptr addrspace(1) %out, align 8
ret void
}
; FIXME: Could do scalar
define amdgpu_kernel void @s_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
; SI-LABEL: s_usubo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0xd
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: v_mov_b32_e32 v0, s13
; SI-NEXT: v_sub_i32_e32 v0, vcc, s12, v0
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v4, s1
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_sub_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: flat_store_byte v[2:3], v5
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_usubo_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, s0, v1
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_endpgm
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
store i32 %val, ptr addrspace(1) %out, align 4
store i1 %carry, ptr addrspace(1) %carryout
ret void
}
define amdgpu_kernel void @v_usubo_i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_usubo_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_usubo_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: flat_store_byte v[2:3], v5
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_usubo_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
%b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
%a = load i32, ptr addrspace(1) %a.gep, align 4
%b = load i32, ptr addrspace(1) %b.gep, align 4
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
store i32 %val, ptr addrspace(1) %out, align 4
store i1 %carry, ptr addrspace(1) %carryout
ret void
}
define amdgpu_kernel void @v_usubo_i32_novcc(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_usubo_i32_novcc:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: buffer_load_dword v0, off, s[12:15], 0
; SI-NEXT: buffer_load_dword v1, off, s[16:19], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: ;;#ASMSTART
; SI-NEXT: ;;#ASMEND
; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_usubo_i32_novcc:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: flat_load_dword v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v5
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; VI-NEXT: flat_store_dword v[0:1], v4
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: ;;#ASMSTART
; VI-NEXT: ;;#ASMEND
; VI-NEXT: flat_store_byte v[2:3], v5
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_usubo_i32_novcc:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v1, v0, s[8:9]
; GFX9-NEXT: global_load_dword v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: global_store_byte v0, v2, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
%b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
%a = load i32, ptr addrspace(1) %a.gep, align 4
%b = load i32, ptr addrspace(1) %b.gep, align 4
%uadd = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %uadd, 0
%carry = extractvalue { i32, i1 } %uadd, 1
store volatile i32 %val, ptr addrspace(1) %out, align 4
call void asm sideeffect "", "~{vcc}"() #0
store volatile i1 %carry, ptr addrspace(1) %carryout
ret void
}
define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i64 %a, i64 %b) #0 {
; SI-LABEL: s_usubo_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_u32 s6, s4, s6
; SI-NEXT: s_subb_u32 s7, s5, s7
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s12, s2
; SI-NEXT: s_mov_b32 s13, s3
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: v_mov_b32_e32 v1, s5
; SI-NEXT: v_mov_b32_e32 v2, s6
; SI-NEXT: v_mov_b32_e32 v3, s7
; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[6:7], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[12:15], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_sub_u32 s0, s4, s6
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_subb_u32 s1, s5, s7
; VI-NEXT: v_mov_b32_e32 v5, s5
; VI-NEXT: v_mov_b32_e32 v7, s1
; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
; VI-NEXT: v_mov_b32_e32 v6, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_store_dwordx2 v[0:1], v[6:7]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_usubo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_sub_u32 s0, s8, s10
; GFX9-NEXT: v_mov_b32_e32 v0, s8
; GFX9-NEXT: v_mov_b32_e32 v1, s9
; GFX9-NEXT: s_subb_u32 s1, s9, s11
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
store i64 %val, ptr addrspace(1) %out, align 8
store i1 %carry, ptr addrspace(1) %carryout
ret void
}
define amdgpu_kernel void @v_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_usubo_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v2, vcc, v0, v2
; SI-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[8:11], 0
; SI-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_usubo_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v2, vcc, v0, v2
; VI-NEXT: v_subb_u32_e32 v3, vcc, v1, v3, vcc
; VI-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; VI-NEXT: flat_store_dwordx2 v[4:5], v[2:3]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[6:7], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_usubo_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[2:3], v[0:1]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: global_store_byte v4, v0, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i64, ptr addrspace(1) %a.ptr
%b.gep = getelementptr inbounds i64, ptr addrspace(1) %b.ptr
%a = load i64, ptr addrspace(1) %a.gep
%b = load i64, ptr addrspace(1) %b.gep
%usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
%val = extractvalue { i64, i1 } %usub, 0
%carry = extractvalue { i64, i1 } %usub, 1
store i64 %val, ptr addrspace(1) %out, align 8
store i1 %carry, ptr addrspace(1) %carryout
ret void
}
define amdgpu_kernel void @v_usubo_i16(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_usubo_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: buffer_load_ushort v0, off, s[12:15], 0
; SI-NEXT: buffer_load_ushort v1, off, s[16:19], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
; SI-NEXT: buffer_store_short v0, off, s[8:11], 0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_usubo_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_ushort v4, v[0:1]
; VI-NEXT: flat_load_ushort v5, v[2:3]
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u16_e32 v5, v4, v5
; VI-NEXT: v_cmp_gt_u16_e32 vcc, v5, v4
; VI-NEXT: flat_store_short v[0:1], v5
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: flat_store_byte v[2:3], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_usubo_i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ushort v1, v0, s[8:9]
; GFX9-NEXT: global_load_ushort v2, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_u16_e32 v2, v1, v2
; GFX9-NEXT: v_cmp_gt_u16_e32 vcc, v2, v1
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: global_store_short v0, v2, s[4:5]
; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i16, ptr addrspace(1) %a.ptr
%b.gep = getelementptr inbounds i16, ptr addrspace(1) %b.ptr
%a = load i16, ptr addrspace(1) %a.gep
%b = load i16, ptr addrspace(1) %b.gep
%usub = call { i16, i1 } @llvm.usub.with.overflow.i16(i16 %a, i16 %b)
%val = extractvalue { i16, i1 } %usub, 0
%carry = extractvalue { i16, i1 } %usub, 1
store i16 %val, ptr addrspace(1) %out
store i1 %carry, ptr addrspace(1) %carryout
ret void
}
define amdgpu_kernel void @v_usubo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind {
; SI-LABEL: v_usubo_v2i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s11, 0xf000
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_mov_b32 s14, s10
; SI-NEXT: s_mov_b32 s15, s11
; SI-NEXT: s_mov_b32 s18, s10
; SI-NEXT: s_mov_b32 s19, s11
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s12, s4
; SI-NEXT: s_mov_b32 s13, s5
; SI-NEXT: s_mov_b32 s16, s6
; SI-NEXT: s_mov_b32 s17, s7
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0
; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[16:19], 0
; SI-NEXT: s_mov_b32 s6, s10
; SI-NEXT: s_mov_b32 s7, s11
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
; SI-NEXT: s_mov_b32 s4, s2
; SI-NEXT: s_mov_b32 s5, s3
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; SI-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
; SI-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_usubo_v2i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_mov_b32_e32 v2, s6
; VI-NEXT: v_mov_b32_e32 v3, s7
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v4, s0
; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: v_mov_b32_e32 v6, s2
; VI-NEXT: v_mov_b32_e32 v7, s3
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_sub_u32_e32 v1, vcc, v1, v3
; VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; VI-NEXT: v_sub_u32_e32 v0, vcc, v0, v2
; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1]
; VI-NEXT: flat_store_dwordx2 v[6:7], v[2:3]
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_usubo_v2i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[8:9]
; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v1, vcc, v1, v3
; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
; GFX9-NEXT: s_endpgm
%a = load <2 x i32>, ptr addrspace(1) %aptr, align 4
%b = load <2 x i32>, ptr addrspace(1) %bptr, align 4
%sadd = call { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32> %a, <2 x i32> %b) nounwind
%val = extractvalue { <2 x i32>, <2 x i1> } %sadd, 0
%carry = extractvalue { <2 x i32>, <2 x i1> } %sadd, 1
store <2 x i32> %val, ptr addrspace(1) %out, align 4
%carry.ext = zext <2 x i1> %carry to <2 x i32>
store <2 x i32> %carry.ext, ptr addrspace(1) %carryout
ret void
}
define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, i32 %a, i32 %b) #0 {
; SI-LABEL: s_usubo_clamp_bit:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_sub_i32_e32 v0, vcc, s0, v0
; SI-NEXT: s_cmp_eq_u32 s0, s1
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_cbranch_scc1 .LBB8_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[0:1], vcc, -1
; SI-NEXT: .LBB8_2: ; %exit
; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1]
; SI-NEXT: s_mov_b32 s10, s2
; SI-NEXT: s_mov_b32 s11, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s8, s6
; SI-NEXT: s_mov_b32 s9, s7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_byte v1, off, s[8:11], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_usubo_clamp_bit:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: s_cmp_eq_u32 s0, s1
; VI-NEXT: v_sub_u32_e32 v0, vcc, s0, v0
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_cbranch_scc1 .LBB8_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: s_xor_b64 s[0:1], vcc, -1
; VI-NEXT: .LBB8_2: ; %exit
; VI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1]
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v3, s6
; VI-NEXT: v_mov_b32_e32 v4, s7
; VI-NEXT: flat_store_dword v[1:2], v0
; VI-NEXT: flat_store_byte v[3:4], v5
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: s_usubo_clamp_bit:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s1
; GFX9-NEXT: s_cmp_eq_u32 s0, s1
; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
; GFX9-NEXT: s_mov_b64 s[0:1], 0
; GFX9-NEXT: s_cbranch_scc1 .LBB8_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: s_xor_b64 s[0:1], vcc, -1
; GFX9-NEXT: .LBB8_2: ; %exit
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v1, v0, s[4:5]
; GFX9-NEXT: global_store_byte v1, v2, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
%c2 = icmp eq i1 %carry, false
%cc = icmp eq i32 %a, %b
br i1 %cc, label %exit, label %if
if:
br label %exit
exit:
%cout = phi i1 [false, %entry], [%c2, %if]
store i32 %val, ptr addrspace(1) %out, align 4
store i1 %cout, ptr addrspace(1) %carryout
ret void
}
define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %a.ptr, ptr addrspace(1) %b.ptr) #0 {
; SI-LABEL: v_usubo_clamp_bit:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_mov_b32 s14, s2
; SI-NEXT: s_mov_b32 s15, s3
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s8
; SI-NEXT: s_mov_b32 s1, s9
; SI-NEXT: s_mov_b32 s12, s10
; SI-NEXT: s_mov_b32 s13, s11
; SI-NEXT: buffer_load_dword v1, off, s[0:3], 0
; SI-NEXT: buffer_load_dword v2, off, s[12:15], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_sub_i32_e64 v0, s[0:1], v1, v2
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; SI-NEXT: s_mov_b64 s[8:9], 0
; SI-NEXT: s_cbranch_vccnz .LBB9_2
; SI-NEXT: ; %bb.1: ; %if
; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1
; SI-NEXT: .LBB9_2: ; %exit
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9]
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_usubo_clamp_bit:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; VI-NEXT: s_mov_b64 s[2:3], 0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: v_mov_b32_e32 v2, s10
; VI-NEXT: v_mov_b32_e32 v3, s11
; VI-NEXT: flat_load_dword v1, v[0:1]
; VI-NEXT: flat_load_dword v2, v[2:3]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; VI-NEXT: v_sub_u32_e64 v0, s[0:1], v1, v2
; VI-NEXT: s_cbranch_vccnz .LBB9_2
; VI-NEXT: ; %bb.1: ; %if
; VI-NEXT: s_xor_b64 s[2:3], s[0:1], -1
; VI-NEXT: .LBB9_2: ; %exit
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_mov_b32_e32 v3, s6
; VI-NEXT: v_mov_b32_e32 v4, s7
; VI-NEXT: flat_store_dword v[1:2], v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3]
; VI-NEXT: flat_store_byte v[3:4], v0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: v_usubo_clamp_bit:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dword v2, v0, s[8:9]
; GFX9-NEXT: global_load_dword v3, v0, s[10:11]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX9-NEXT: v_sub_co_u32_e64 v1, s[0:1], v2, v3
; GFX9-NEXT: s_cbranch_vccnz .LBB9_2
; GFX9-NEXT: ; %bb.1: ; %if
; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1
; GFX9-NEXT: .LBB9_2: ; %exit
; GFX9-NEXT: global_store_dword v0, v1, s[4:5]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; GFX9-NEXT: global_store_byte v0, v1, s[6:7]
; GFX9-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%a.gep = getelementptr inbounds i32, ptr addrspace(1) %a.ptr
%b.gep = getelementptr inbounds i32, ptr addrspace(1) %b.ptr
%a = load i32, ptr addrspace(1) %a.gep, align 4
%b = load i32, ptr addrspace(1) %b.gep, align 4
%usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
%val = extractvalue { i32, i1 } %usub, 0
%carry = extractvalue { i32, i1 } %usub, 1
%c2 = icmp eq i1 %carry, false
%cc = icmp eq i32 %a, %b
br i1 %cc, label %exit, label %if
if:
br label %exit
exit:
%cout = phi i1 [false, %entry], [%c2, %if]
store i32 %val, ptr addrspace(1) %out, align 4
store i1 %cout, ptr addrspace(1) %carryout
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare { i16, i1 } @llvm.usub.with.overflow.i16(i16, i16) #1
declare { i32, i1 } @llvm.usub.with.overflow.i32(i32, i32) #1
declare { i64, i1 } @llvm.usub.with.overflow.i64(i64, i64) #1
declare { <2 x i32>, <2 x i1> } @llvm.usub.with.overflow.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }