This patch dismantles G_SHUFFLE_VECTOR before lowering. The original lowering would emit extract-vector-element ops. We found that, by using unmerged values instead, the build-vector combine can find ways to fold them. This is only enabled on AMDGPU. This resolves #123631.
19 lines
852 B
LLVM
19 lines
852 B
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX942 %s

; Loads an <8 x half> vector through an addrspace(3) pointer, selects its upper
; four elements (indices 4-7) with a shufflevector, and stores the resulting
; <4 x half>. The expected code stores v[4:5] -- the upper half of the
; ds_read2_b64 result v[2:5] -- directly, with no per-element extract or
; rebuild instructions between the load and the store.
define void @shuffle_to_extract(ptr addrspace(3) %in, ptr addrspace(3) %out) {
; GFX942-LABEL: shuffle_to_extract:
; GFX942: ; %bb.0:
; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX942-NEXT: ds_read2_b64 v[2:5], v0 offset1:1
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: ds_write_b64 v1, v[4:5]
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
  %val = load <8 x half>, ptr addrspace(3) %in, align 8
  ; Mask <4, 5, 6, 7> picks the contiguous upper half of %val.
  %res = shufflevector <8 x half> %val, <8 x half> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  store <4 x half> %res, ptr addrspace(3) %out, align 8
  ret void
}