clang-p2996/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
Fabian Ritter 92a06546ab [LowerMemIntrinsics] Lower llvm.memmove to wide memory accesses (#100122)
So far, the IR-level lowering of llvm.memmove intrinsics generates loops
that copy each byte individually. This can be wasteful for targets that
provide wider memory access operations.

This patch makes the memmove lowering more similar to the lowering of
memcpy with unknown length.
TargetTransformInfo::getMemcpyLoopLoweringType() is queried for an
adequate type for the memory accesses, and if it is wider than a single
byte, the greatest multiple of the type's size that is less than or
equal to the length is copied with corresponding wide memory accesses. A
residual loop with byte-wise accesses (or, if the length is statically
known, a sequence of suitable memory accesses) is introduced for the
remaining bytes.
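
The overall shape of the forward-copying case is roughly as follows. This is a
hand-written sketch, not the exact IR the pass emits: the function name
@copy_fwd_sketch, the block and value names, and the choice of i32 as the wide
access type (so 4-byte steps) for a runtime length %n are illustrative
assumptions.

define void @copy_fwd_sketch(ptr %dst, ptr %src, i32 %n) {
entry:
  ; Largest multiple of the 4-byte access size that fits in %n.
  %main_len = and i32 %n, -4
  %has_main = icmp ne i32 %main_len, 0
  br i1 %has_main, label %main.loop, label %residual.check

main.loop:                                        ; wide i32 copies, front to back
  %i = phi i32 [ 0, %entry ], [ %i.next, %main.loop ]
  %s = getelementptr inbounds i8, ptr %src, i32 %i
  %d = getelementptr inbounds i8, ptr %dst, i32 %i
  %v = load i32, ptr %s, align 1
  store i32 %v, ptr %d, align 1
  %i.next = add i32 %i, 4
  %main.done = icmp uge i32 %i.next, %main_len
  br i1 %main.done, label %residual.check, label %main.loop

residual.check:
  %has_res = icmp ult i32 %main_len, %n
  br i1 %has_res, label %residual.loop, label %done

residual.loop:                                    ; byte-wise copies of the tail
  %j = phi i32 [ %main_len, %residual.check ], [ %j.next, %residual.loop ]
  %rs = getelementptr inbounds i8, ptr %src, i32 %j
  %rd = getelementptr inbounds i8, ptr %dst, i32 %j
  %b = load i8, ptr %rs, align 1
  store i8 %b, ptr %rd, align 1
  %j.next = add i32 %j, 1
  %res.done = icmp uge i32 %j.next, %n
  br i1 %res.done, label %done, label %residual.loop

done:
  ret void
}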

For memmove, this construct is required in two variants: one for copying
forward and one for copying backwards, to handle overlapping memory
ranges. For the backwards case, the residual code still covers the bytes
at the end of the copied region and is therefore executed before the
wide main loop. This implementation choice is based on the assumption
that we are more likely to encounter memory ranges whose start aligns
with the access width than ones whose end does.
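
A corresponding sketch of the backward variant, under the same illustrative
assumptions (i32 wide accesses, runtime length %n, made-up names such as
@copy_bwd_sketch; the actual lowering uses blocks like memmove_bwd_residual,
as the test below shows): the byte-wise residual covers the tail of the region
and runs first, and the wide loop then walks from the end of the aligned part
back to the start.

define void @copy_bwd_sketch(ptr %dst, ptr %src, i32 %n) {
entry:
  ; Residual bytes at the end are copied first, then the wide loop
  ; walks backwards over the remaining multiple of 4.
  %main_len = and i32 %n, -4
  %has_res = icmp ult i32 %main_len, %n
  br i1 %has_res, label %residual.loop, label %main.check

residual.loop:                                    ; byte-wise copies of the tail, back to front
  %j = phi i32 [ %n, %entry ], [ %j.prev, %residual.loop ]
  %j.prev = add i32 %j, -1
  %rs = getelementptr inbounds i8, ptr %src, i32 %j.prev
  %rd = getelementptr inbounds i8, ptr %dst, i32 %j.prev
  %b = load i8, ptr %rs, align 1
  store i8 %b, ptr %rd, align 1
  %res.done = icmp eq i32 %j.prev, %main_len
  br i1 %res.done, label %main.check, label %residual.loop

main.check:
  %has_main = icmp ne i32 %main_len, 0
  br i1 %has_main, label %main.loop, label %done

main.loop:                                        ; wide i32 copies, back to front
  %i = phi i32 [ %main_len, %main.check ], [ %i.prev, %main.loop ]
  %i.prev = add i32 %i, -4
  %s = getelementptr inbounds i8, ptr %src, i32 %i.prev
  %d = getelementptr inbounds i8, ptr %dst, i32 %i.prev
  %v = load i32, ptr %s, align 1
  store i32 %v, ptr %d, align 1
  %main.done = icmp eq i32 %i.prev, 0
  br i1 %main.done, label %done, label %main.loop

done:
  ret void
}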

In microbenchmarks on gfx1030 (AMDGPU), this change yields speedups up
to 16x for memmoves with variable or large constant lengths.

Part of SWDEV-455845.
2024-07-26 08:43:30 +02:00

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
; RUN: llc -global-isel -mtriple=amdgcn-- -verify-machineinstrs -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memmove.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1)
define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src) {
; LOOP-LABEL: memmove_p1i8:
; LOOP: ; %bb.0:
; LOOP-NEXT: v_cmp_ge_u64_e32 vcc, v[2:3], v[0:1]
; LOOP-NEXT: s_and_saveexec_b64 s[0:1], vcc
; LOOP-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; LOOP-NEXT: s_cbranch_execnz .LBB0_3
; LOOP-NEXT: ; %bb.1: ; %Flow
; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; LOOP-NEXT: s_cbranch_execnz .LBB0_4
; LOOP-NEXT: .LBB0_2: ; %memmove_done
; LOOP-NEXT: s_endpgm
; LOOP-NEXT: .LBB0_3:
; LOOP-NEXT: s_mov_b32 s6, 0
; LOOP-NEXT: s_mov_b32 s7, 0xf000
; LOOP-NEXT: s_mov_b64 s[4:5], 0
; LOOP-NEXT: buffer_load_ubyte v4, v[2:3], s[4:7], 0 addr64 offset:1
; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[4:7], 0 addr64 offset:3
; LOOP-NEXT: buffer_load_ubyte v6, v[2:3], s[4:7], 0 addr64 offset:2
; LOOP-NEXT: buffer_load_ubyte v2, v[2:3], s[4:7], 0 addr64
; LOOP-NEXT: s_waitcnt vmcnt(3)
; LOOP-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; LOOP-NEXT: s_waitcnt vmcnt(2)
; LOOP-NEXT: v_lshlrev_b32_e32 v4, 24, v5
; LOOP-NEXT: s_waitcnt vmcnt(1)
; LOOP-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; LOOP-NEXT: s_waitcnt vmcnt(0)
; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
; LOOP-NEXT: v_or_b32_e32 v3, v4, v5
; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
; LOOP-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; LOOP-NEXT: v_bfe_u32 v4, v2, 8, 8
; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[4:7], 0 addr64
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[4:7], 0 addr64 offset:1
; LOOP-NEXT: buffer_store_byte v3, v[0:1], s[4:7], 0 addr64 offset:2
; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[4:7], 0 addr64 offset:3
; LOOP-NEXT: ; implicit-def: $vgpr2_vgpr3
; LOOP-NEXT: ; implicit-def: $vgpr0_vgpr1
; LOOP-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; LOOP-NEXT: s_cbranch_execz .LBB0_2
; LOOP-NEXT: .LBB0_4: ; %memmove_bwd_residual
; LOOP-NEXT: s_mov_b32 s2, 0
; LOOP-NEXT: s_mov_b32 s3, 0xf000
; LOOP-NEXT: s_mov_b64 s[0:1], 0
; LOOP-NEXT: s_waitcnt expcnt(2)
; LOOP-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:1
; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:3
; LOOP-NEXT: buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:2
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64
; LOOP-NEXT: s_waitcnt vmcnt(3)
; LOOP-NEXT: v_lshlrev_b32_e32 v3, 8, v4
; LOOP-NEXT: s_waitcnt vmcnt(2)
; LOOP-NEXT: v_lshlrev_b32_e32 v4, 24, v5
; LOOP-NEXT: s_waitcnt vmcnt(1)
; LOOP-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; LOOP-NEXT: s_waitcnt vmcnt(0)
; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
; LOOP-NEXT: v_or_b32_e32 v3, v4, v5
; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
; LOOP-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; LOOP-NEXT: v_bfe_u32 v4, v2, 8, 8
; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:1
; LOOP-NEXT: buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:2
; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:3
; LOOP-NEXT: s_endpgm
;
; UNROLL-LABEL: memmove_p1i8:
; UNROLL: ; %bb.0:
; UNROLL-NEXT: s_mov_b32 s2, 0
; UNROLL-NEXT: s_mov_b32 s3, 0xf000
; UNROLL-NEXT: s_mov_b64 s[0:1], 0
; UNROLL-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64
; UNROLL-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:1
; UNROLL-NEXT: buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:2
; UNROLL-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:3
; UNROLL-NEXT: s_waitcnt vmcnt(3)
; UNROLL-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64
; UNROLL-NEXT: s_waitcnt vmcnt(3)
; UNROLL-NEXT: buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:1
; UNROLL-NEXT: s_waitcnt vmcnt(3)
; UNROLL-NEXT: buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:2
; UNROLL-NEXT: s_waitcnt vmcnt(3)
; UNROLL-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:3
; UNROLL-NEXT: s_endpgm
call void @llvm.memmove.p1.p1.i32(ptr addrspace(1) %dst, ptr addrspace(1) %src, i32 4, i1 false)
ret void
}