In the SelectionDAG lowering of the memcpy intrinsic, this optimization introduces additional chains between fixed-size groups of loads and the corresponding stores. While it was originally introduced to ensure that wider load/store-pair instructions are generated on AArch64, it also improves code generation for AMDGPU: ganged loads are scheduled into a clause, and each store only waits for the completion of its corresponding load. The chosen group size of 16 performed well in microbenchmarks; values of 8, 32, and 64 performed similarly.

The testcase updates are autogenerated by utils/update_llc_test_checks.py.

See also:
- PR introducing this optimization: https://reviews.llvm.org/D46477

Part of SWDEV-455845.
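For reference, the target-side change amounts to setting a single per-target limit. Below is a minimal sketch, assuming the knob introduced by D46477 is the MaxGluedStoresPerMemcpy member of TargetLoweringBase and that AMDGPU sets it in the SITargetLowering constructor; the member name and location are assumptions for illustration, not quotes from the patch.

  // Hypothetical sketch (assumed member name/location, not the verbatim patch):
  SITargetLowering::SITargetLowering(const TargetMachine &TM,
                                     const GCNSubtarget &STI)
      : AMDGPUTargetLowering(TM, STI) {
    // ...
    // Glue memcpy loads/stores in groups of 16: the scheduler can then issue
    // each group of loads as a clause, and each store waits only for its own
    // load instead of for the whole memcpy expansion.
    MaxGluedStoresPerMemcpy = 16;
    // ...
  }

With the limit in place, the FLATSCR checks in the test below show the intended pattern: the three loads are clustered, and each scratch store waits via s_waitcnt only for the load that feeds it.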
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s -check-prefix=MUBUF
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+enable-flat-scratch < %s | FileCheck %s -check-prefix=FLATSCR

; Make sure there's no assertion from passing a 0 alignment value
define void @memcpy_fixed_align(ptr addrspace(5) %dst, ptr addrspace(1) %src) {
; MUBUF-LABEL: memcpy_fixed_align:
; MUBUF:       ; %bb.0:
; MUBUF-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; MUBUF-NEXT:    global_load_dwordx2 v[11:12], v[1:2], off offset:32
; MUBUF-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; MUBUF-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
; MUBUF-NEXT:    v_lshrrev_b32_e64 v0, 6, s32
; MUBUF-NEXT:    s_waitcnt vmcnt(2)
; MUBUF-NEXT:    buffer_store_dword v11, off, s[0:3], s32 offset:32
; MUBUF-NEXT:    buffer_store_dword v12, off, s[0:3], s32 offset:36
; MUBUF-NEXT:    s_waitcnt vmcnt(3)
; MUBUF-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:12
; MUBUF-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:8
; MUBUF-NEXT:    buffer_store_dword v4, off, s[0:3], s32 offset:4
; MUBUF-NEXT:    buffer_store_dword v3, off, s[0:3], s32
; MUBUF-NEXT:    s_waitcnt vmcnt(6)
; MUBUF-NEXT:    buffer_store_dword v10, off, s[0:3], s32 offset:28
; MUBUF-NEXT:    buffer_store_dword v9, off, s[0:3], s32 offset:24
; MUBUF-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:20
; MUBUF-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:16
; MUBUF-NEXT:    ;;#ASMSTART
; MUBUF-NEXT:    ; use v0
; MUBUF-NEXT:    ;;#ASMEND
; MUBUF-NEXT:    s_waitcnt vmcnt(0)
; MUBUF-NEXT:    s_setpc_b64 s[30:31]
;
; FLATSCR-LABEL: memcpy_fixed_align:
; FLATSCR:       ; %bb.0:
; FLATSCR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FLATSCR-NEXT:    global_load_dwordx4 v[3:6], v[1:2], off
; FLATSCR-NEXT:    global_load_dwordx4 v[7:10], v[1:2], off offset:16
; FLATSCR-NEXT:    global_load_dwordx2 v[11:12], v[1:2], off offset:32
; FLATSCR-NEXT:    v_mov_b32_e32 v0, s32
; FLATSCR-NEXT:    s_waitcnt vmcnt(2)
; FLATSCR-NEXT:    scratch_store_dwordx4 off, v[3:6], s32
; FLATSCR-NEXT:    s_waitcnt vmcnt(2)
; FLATSCR-NEXT:    scratch_store_dwordx4 off, v[7:10], s32 offset:16
; FLATSCR-NEXT:    s_waitcnt vmcnt(2)
; FLATSCR-NEXT:    scratch_store_dwordx2 off, v[11:12], s32 offset:32
; FLATSCR-NEXT:    ;;#ASMSTART
; FLATSCR-NEXT:    ; use v0
; FLATSCR-NEXT:    ;;#ASMEND
; FLATSCR-NEXT:    s_waitcnt vmcnt(0)
; FLATSCR-NEXT:    s_setpc_b64 s[30:31]
  %alloca = alloca [40 x i8], addrspace(5)
  call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 dereferenceable(40) %alloca, ptr addrspace(1) align 4 dereferenceable(40) %src, i64 40, i1 false)
  call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %alloca) #0
  ret void
}

declare void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #0

attributes #0 = { argmemonly nounwind willreturn }