Files
clang-p2996/llvm/test/CodeGen/AMDGPU/sign_extend.ll
Stanislav Mekhanoshin 555d8f4ef5 [AMDGPU] Bundle loads before post-RA scheduler
We are relying on atrificial DAG edges inserted by the
MemOpClusterMutation to keep loads and stores together in the
post-RA scheduler. This does not work all the time since it
allows to schedule a completely independent instruction in the
middle of the cluster.

Removed the DAG mutation and added pass to bundle already
clustered instructions. These bundles are unpacked before the
memory legalizer because it does not work with bundles but also
because it allows to insert waitcounts in the middle of a store
cluster.

Removing artificial edges also allows a more relaxed scheduling.

Differential Revision: https://reviews.llvm.org/D72737
2020-01-24 11:33:38 -08:00

585 lines
21 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,SI
; RUN: llc -mtriple=amdgcn-- -amdgpu-scalarize-global-loads=false -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -allow-deprecated-dag-overlap -enable-var-scope -check-prefixes=GCN,VI
define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i1_to_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%cmp = icmp eq i32 %a, %b
%sext = sext i1 %cmp to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: test_s_sext_i32_to_i64:
; SI: ; %bb.0: ; %entry
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mul_i32 s0, s0, s1
; SI-NEXT: s_add_i32 s0, s0, s2
; SI-NEXT: s_ashr_i32 s1, s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_s_sext_i32_to_i64:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mul_i32 s0, s0, s1
; VI-NEXT: s_add_i32 s0, s0, s2
; VI-NEXT: s_ashr_i32 s1, s0, 31
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
entry:
%mul = mul i32 %a, %b
%add = add i32 %mul, %c
%sext = sext i32 %add to i64
store i64 %sext, i64 addrspace(1)* %out, align 8
ret void
}
define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: v_mov_b32_e32 v1, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i1_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%cmp = icmp eq i32 %a, %b
%sext = sext i1 %cmp to i64
store i64 %sext, i64 addrspace(1)* %out, align 8
ret void
}
define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
; SI-LABEL: s_sext_i32_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_ashr_i32 s1, s0, 31
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i32_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s1, s0, 31
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%sext = sext i32 %a to i64
store i64 %sext, i64 addrspace(1)* %out, align 8
ret void
}
define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_i32_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sext_i32_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
%val = load i32, i32 addrspace(1)* %in, align 4
%sext = sext i32 %val to i64
store i64 %sext, i64 addrspace(1)* %out, align 8
ret void
}
define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
; SI-LABEL: s_sext_i16_to_i64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i16_to_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x100000
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
%sext = sext i16 %a to i64
store i64 %sext, i64 addrspace(1)* %out, align 8
ret void
}
define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
; SI-LABEL: s_sext_i1_to_i16:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%cmp = icmp eq i32 %a, %b
%sext = sext i1 %cmp to i16
store i16 %sext, i16 addrspace(1)* %out
ret void
}
; This purpose of this test is to make sure the i16 = sign_extend i1 node
; makes it all the way throught the legalizer/optimizer to make sure
; we select this correctly. In the s_sext_i1_to_i16, the sign_extend node
; is optimized to a select very early.
define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; SI-LABEL: s_sext_i1_to_i16_with_and:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v1
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_i1_to_i16_with_and:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v1
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%cmp0 = icmp eq i32 %a, %b
%cmp1 = icmp eq i32 %c, %d
%cmp = and i1 %cmp0, %cmp1
%sext = sext i1 %cmp to i16
store i16 %sext, i16 addrspace(1)* %out
ret void
}
define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
; SI-LABEL: v_sext_i1_to_i16_with_and:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1
; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sext_i1_to_i16_with_and:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0
; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], s1, v1
; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1]
; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
%cmp0 = icmp eq i32 %a, %tid
%cmp1 = icmp eq i32 %b, %c
%cmp = and i1 %cmp0, %cmp1
%sext = sext i1 %cmp to i16
store i16 %sext, i16 addrspace(1)* %out
ret void
}
; FIXME: We end up with a v_bfe instruction, because the i16 srl
; gets selected to a v_lshrrev_b16 instructions, so the input to
; the bfe is a vector registers. To fix this we need to be able to
; optimize:
; t29: i16 = truncate t10
; t55: i16 = srl t29, Constant:i32<8>
; t63: i32 = any_extend t55
; t64: i32 = sign_extend_inreg t63, ValueType:ch:i8
define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
; SI-LABEL: s_sext_v4i8_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
; SI-NEXT: s_load_dword s0, s[0:1], 0xb
; SI-NEXT: s_mov_b32 s7, 0xf000
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_ashr_i32 s1, s0, 24
; SI-NEXT: s_bfe_i32 s2, s0, 0x80010
; SI-NEXT: s_bfe_i32 s3, s0, 0x80008
; SI-NEXT: s_sext_i32_i8 s0, s0
; SI-NEXT: v_mov_b32_e32 v0, s0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s3
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s2
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s1
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_v4i8_to_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
; VI-NEXT: s_load_dword s0, s[0:1], 0x2c
; VI-NEXT: s_mov_b32 s7, 0xf000
; VI-NEXT: s_mov_b32 s6, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_lshrrev_b16_e64 v0, 8, s0
; VI-NEXT: s_ashr_i32 s1, s0, 24
; VI-NEXT: s_bfe_i32 s2, s0, 0x80010
; VI-NEXT: s_sext_i32_i8 s0, s0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
; VI-NEXT: v_mov_b32_e32 v1, s0
; VI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: v_mov_b32_e32 v0, s1
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%cast = bitcast i32 %a to <4 x i8>
%ext = sext <4 x i8> %cast to <4 x i32>
%elt0 = extractelement <4 x i32> %ext, i32 0
%elt1 = extractelement <4 x i32> %ext, i32 1
%elt2 = extractelement <4 x i32> %ext, i32 2
%elt3 = extractelement <4 x i32> %ext, i32 3
store volatile i32 %elt0, i32 addrspace(1)* %out
store volatile i32 %elt1, i32 addrspace(1)* %out
store volatile i32 %elt2, i32 addrspace(1)* %out
store volatile i32 %elt3, i32 addrspace(1)* %out
ret void
}
; FIXME: need to optimize same sequence as above test to avoid
; this shift.
define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_v4i8_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ashrrev_i32_e32 v1, 24, v0
; SI-NEXT: v_bfe_i32 v2, v0, 16, 8
; SI-NEXT: v_bfe_i32 v3, v0, 8, 8
; SI-NEXT: v_bfe_i32 v0, v0, 0, 8
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sext_v4i8_to_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
; VI-NEXT: v_ashrrev_i32_e32 v2, 24, v0
; VI-NEXT: v_bfe_i32 v3, v0, 16, 8
; VI-NEXT: v_bfe_i32 v0, v0, 0, 8
; VI-NEXT: v_bfe_i32 v1, v1, 0, 8
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT: s_endpgm
%a = load i32, i32 addrspace(1)* %in
%cast = bitcast i32 %a to <4 x i8>
%ext = sext <4 x i8> %cast to <4 x i32>
%elt0 = extractelement <4 x i32> %ext, i32 0
%elt1 = extractelement <4 x i32> %ext, i32 1
%elt2 = extractelement <4 x i32> %ext, i32 2
%elt3 = extractelement <4 x i32> %ext, i32 3
store volatile i32 %elt0, i32 addrspace(1)* %out
store volatile i32 %elt1, i32 addrspace(1)* %out
store volatile i32 %elt2, i32 addrspace(1)* %out
store volatile i32 %elt3, i32 addrspace(1)* %out
ret void
}
; FIXME: s_bfe_i64, same on SI and VI
define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
; SI-LABEL: s_sext_v4i16_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_ashr_i64 s[4:5], s[6:7], 48
; SI-NEXT: s_ashr_i32 s5, s6, 16
; SI-NEXT: s_sext_i32_i16 s6, s6
; SI-NEXT: v_mov_b32_e32 v0, s6
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s5
; SI-NEXT: s_sext_i32_i16 s7, s7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s7
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_waitcnt expcnt(0)
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: s_sext_v4i16_to_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_ashr_i32 s5, s6, 16
; VI-NEXT: s_sext_i32_i16 s6, s6
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_ashr_i32 s4, s7, 16
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: v_mov_b32_e32 v0, s5
; VI-NEXT: s_sext_i32_i16 s7, s7
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: v_mov_b32_e32 v0, s7
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: s_endpgm
%cast = bitcast i64 %a to <4 x i16>
%ext = sext <4 x i16> %cast to <4 x i32>
%elt0 = extractelement <4 x i32> %ext, i32 0
%elt1 = extractelement <4 x i32> %ext, i32 1
%elt2 = extractelement <4 x i32> %ext, i32 2
%elt3 = extractelement <4 x i32> %ext, i32 3
store volatile i32 %elt0, i32 addrspace(1)* %out
store volatile i32 %elt1, i32 addrspace(1)* %out
store volatile i32 %elt2, i32 addrspace(1)* %out
store volatile i32 %elt3, i32 addrspace(1)* %out
ret void
}
define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
; SI-LABEL: v_sext_v4i16_to_v4i32:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s0, s4
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: s_mov_b32 s4, s6
; SI-NEXT: s_mov_b32 s5, s7
; SI-NEXT: s_mov_b32 s6, s2
; SI-NEXT: s_mov_b32 s7, s3
; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_ashr_i64 v[2:3], v[0:1], 48
; SI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
; SI-NEXT: v_bfe_i32 v0, v0, 0, 16
; SI-NEXT: v_bfe_i32 v1, v1, 0, 16
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: v_sext_v4i16_to_v4i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s4
; VI-NEXT: s_mov_b32 s1, s5
; VI-NEXT: s_mov_b32 s4, s6
; VI-NEXT: s_mov_b32 s5, s7
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_ashrrev_i32_e32 v3, 16, v0
; VI-NEXT: v_bfe_i32 v0, v0, 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v2, 16, v1
; VI-NEXT: v_bfe_i32 v1, v1, 0, 16
; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v3, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v1, off, s[0:3], 0
; VI-NEXT: buffer_store_dword v2, off, s[0:3], 0
; VI-NEXT: s_endpgm
%a = load i64, i64 addrspace(1)* %in
%cast = bitcast i64 %a to <4 x i16>
%ext = sext <4 x i16> %cast to <4 x i32>
%elt0 = extractelement <4 x i32> %ext, i32 0
%elt1 = extractelement <4 x i32> %ext, i32 1
%elt2 = extractelement <4 x i32> %ext, i32 2
%elt3 = extractelement <4 x i32> %ext, i32 3
store volatile i32 %elt0, i32 addrspace(1)* %out
store volatile i32 %elt1, i32 addrspace(1)* %out
store volatile i32 %elt2, i32 addrspace(1)* %out
store volatile i32 %elt3, i32 addrspace(1)* %out
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #1 = { nounwind readnone }