So far, the IR-level lowering of llvm.memmove intrinsics generates loops that copy each byte individually. This can be wasteful for targets that provide wider memory access operations. This patch makes the memmove lowering more similar to the lowering of memcpy with unknown length. TargetTransformInfo::getMemcpyLoopLoweringType() is queried for an adequate type for the memory accesses, and if it is wider than a single byte, the greatest multiple of the type's size that is less than or equal to the length is copied with corresponding wide memory accesses. A residual loop with byte-wise accesses (or a sequence of suitable memory accesses in case the length is statically known) is introduced for the remaining bytes. For memmove, this construct is required in two variants: one for copying forward and one for copying backwards, to handle overlapping memory ranges. For the backwards case, the residual code still covers the bytes at the end of the copied region and is therefore executed before the wide main loop. This implementation choice is based on the assumption that we are more likely to encounter memory ranges whose start aligns with the access width than ones whose end does. In microbenchmarks on gfx1030 (AMDGPU), this change yields speedups up to 16x for memmoves with variable or large constant lengths. Part of SWDEV-455845.
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
;
; Tests GlobalISel lowering of a small, constant-length llvm.memmove on AMDGPU.
; With -mem-intrinsic-expand-size=3 the 4-byte memmove is expanded into the
; forward/backward copy construct (LOOP prefix); with expand-size=5 it is fully
; unrolled into straight-line loads/stores (UNROLL prefix).
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s

declare void @llvm.memmove.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1)

define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
; LOOP-LABEL: memmove_p1i8:
; LOOP: ; %bb.0:
; LOOP-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1]
; LOOP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; LOOP-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; LOOP-NEXT: s_cbranch_execnz .LBB0_3
; LOOP-NEXT: ; %bb.1: ; %Flow
; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; LOOP-NEXT: s_cbranch_execnz .LBB0_4
; LOOP-NEXT: .LBB0_2: ; %memmove_done
; LOOP-NEXT: s_endpgm
; LOOP-NEXT: .LBB0_3:
; LOOP-NEXT: s_mov_b32 s6, 0
; LOOP-NEXT: s_mov_b32 s7, 0xf000
; LOOP-NEXT: s_mov_b64 s[4:5], 0
; LOOP-NEXT: buffer_load_ubyte v4, v[2:3], s[4:7], 0 addr64 offset:1
; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[4:7], 0 addr64 offset:3
; LOOP-NEXT: buffer_load_ubyte v6, v[2:3], s[4:7], 0 addr64 offset:2
; LOOP-NEXT: buffer_load_ubyte v2, v[2:3], s[4:7], 0 addr64
; LOOP-NEXT: s_waitcnt vmcnt(3)
; LOOP-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; LOOP-NEXT: s_waitcnt vmcnt(2)
; LOOP-NEXT: v_lshlrev_b32_e32 v4, 24, v5
; LOOP-NEXT: s_waitcnt vmcnt(1)
; LOOP-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; LOOP-NEXT: s_waitcnt vmcnt(0)
; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
; LOOP-NEXT: v_or_b32_e32 v3, v4, v5
; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
; LOOP-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; LOOP-NEXT: v_bfe_u32 v4, v2, 8, 8
; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[4:7], 0 addr64
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[4:7], 0 addr64 offset:1
; LOOP-NEXT: buffer_store_byte v3, v[0:1], s[4:7], 0 addr64 offset:2
; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[4:7], 0 addr64 offset:3
; LOOP-NEXT: ; implicit-def: $vgpr2_vgpr3
; LOOP-NEXT: ; implicit-def: $vgpr0_vgpr1
; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; LOOP-NEXT: s_cbranch_execz .LBB0_2
; LOOP-NEXT: .LBB0_4: ; %memmove_bwd_residual
; LOOP-NEXT: s_mov_b32 s2, 0
; LOOP-NEXT: s_mov_b32 s3, 0xf000
; LOOP-NEXT: s_mov_b64 s[0:1], 0
; LOOP-NEXT: s_waitcnt expcnt(2)
; LOOP-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:1
; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:3
; LOOP-NEXT: buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:2
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64
; LOOP-NEXT: s_waitcnt vmcnt(3)
; LOOP-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; LOOP-NEXT: s_waitcnt vmcnt(2)
; LOOP-NEXT: v_lshlrev_b32_e32 v4, 24, v5
; LOOP-NEXT: s_waitcnt vmcnt(1)
; LOOP-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; LOOP-NEXT: s_waitcnt vmcnt(0)
; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
; LOOP-NEXT: v_or_b32_e32 v3, v4, v5
; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
; LOOP-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; LOOP-NEXT: v_bfe_u32 v4, v2, 8, 8
; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:1
; LOOP-NEXT: buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:2
; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:3
; LOOP-NEXT: s_endpgm
;
; UNROLL-LABEL: memmove_p1i8:
; UNROLL: ; %bb.0:
; UNROLL-NEXT: s_mov_b32 s2, 0
; UNROLL-NEXT: s_mov_b32 s3, 0xf000
; UNROLL-NEXT: s_mov_b64 s[0:1], 0
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64
; UNROLL-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:1
; UNROLL-NEXT: buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:2
; UNROLL-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:3
; UNROLL-NEXT: s_waitcnt vmcnt(3)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64
; UNROLL-NEXT: s_waitcnt vmcnt(3)
; UNROLL-NEXT: buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:1
; UNROLL-NEXT: s_waitcnt vmcnt(3)
; UNROLL-NEXT: buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:2
; UNROLL-NEXT: s_waitcnt vmcnt(3)
; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:3
; UNROLL-NEXT: s_endpgm
  call void @llvm.memmove.p1.p1.i32(ptr addrspace(1) %dst, ptr addrspace(1) %src, i32 4, i1 false)
  ret void
}