We had some instances where LLVM would not inline a fixed-count memcpy and ended up attempting to lower it as a libcall, which does not work on AMDGPU because the address space doesn't meet the requirement, causing a compiler crash. This patch relaxes the threshold used for -Os/-Oz compilation so that we are always allowed to inline memory copy functions. It does for AMDGPU essentially what https://reviews.llvm.org/D158226 did. Fix #88497.
2697 lines
135 KiB
LLVM
2697 lines
135 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
|
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 %s -o - | FileCheck %s
|
|
|
|
; %struct.S wraps a single array of 32 i32 values (32 * 4 = 128 bytes).
; @shared is one instance of it placed in address space 3 with 4-byte
; alignment and an undef initializer.
; NOTE(review): @shared is not referenced by the functions immediately
; following; presumably it is the LDS operand of a later test in this
; file -- confirm against the rest of the file.
%struct.S = type { [32 x i32] }
|
|
|
|
@shared = addrspace(3) global %struct.S undef, align 4
|
|
|
|
; Copies a constant 47 bytes from %src to %dest with llvm.memcpy, where both
; pointers are in the default (flat) address space and neither carries an
; align attribute. The expected output is a fully expanded copy: 47
; single-byte flat load/store pairs covering offsets 0 through 46, running
; straight from the kernel-argument load to s_endpgm with no call in between.
; NOTE(review): attribute group #0 is defined outside this excerpt; judging
; by the function name it is presumably minsize -- confirm.
define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0 {
|
|
; CHECK-LABEL: memcpy_p0_p0_minsize:
|
|
; CHECK: ; %bb.0: ; %entry
|
|
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, s2
|
|
; CHECK-NEXT: v_mov_b32_e32 v1, s3
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1]
|
|
; CHECK-NEXT: v_mov_b32_e32 v3, s1
|
|
; CHECK-NEXT: v_mov_b32_e32 v2, s0
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:1
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:1
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:2
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:2
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:3
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:3
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:4
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:4
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:5
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:5
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:6
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:6
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:7
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:7
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:8
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:8
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:9
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:9
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:10
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:10
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:11
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:11
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:12
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:12
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:13
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:13
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:14
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:14
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:15
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:15
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:16
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:16
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:17
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:17
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:18
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:18
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:19
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:19
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:20
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:20
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:21
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:21
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:22
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:22
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:23
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:23
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:24
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:24
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:25
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:25
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:26
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:26
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:27
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:27
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:28
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:28
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:29
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:29
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:31
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:31
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:32
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:32
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:33
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:33
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:34
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:34
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:35
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:35
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:36
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:36
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:37
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:37
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:38
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:38
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:39
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:39
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:40
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:40
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:41
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:41
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:42
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:42
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:43
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:43
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:44
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:44
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:45
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:45
|
|
; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:46
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:46
|
|
; CHECK-NEXT: s_endpgm
|
|
entry:
|
|
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
|
|
ret void
|
|
}
|
|
|
|
; Copies a constant 47 bytes between two address space 1 pointers with
; llvm.memcpy. The expected output uses wide accesses instead of bytes:
; two 16-byte (dwordx4) load/store pairs at offsets 0 and 16, plus two
; 8-byte (dwordx2) pairs at offsets 32 and 39. The pair at offset 39
; overlaps the one at 32 by one byte, so together they cover bytes 32..46
; and the copy ends exactly at byte 47.
; NOTE(review): attribute group #0 is defined outside this excerpt; judging
; by the function name it is presumably minsize -- confirm.
define amdgpu_kernel void @memcpy_p1_p1_minsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #0 {
|
|
; CHECK-LABEL: memcpy_p1_p1_minsize:
|
|
; CHECK: ; %bb.0: ; %entry
|
|
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; CHECK-NEXT: v_mov_b32_e32 v4, 0
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:32
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:32
|
|
; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:39
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:39
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
|
|
; CHECK-NEXT: s_endpgm
|
|
entry:
|
|
tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
|
|
ret void
|
|
}
|
|
|
|
; Copies a constant 128 bytes from an address space 4 pointer (%0) to an
; address space 1 pointer (%global) with llvm.memcpy. The expected output
; is eight 16-byte (dwordx4) global load/store pairs at offsets 0, 16, ...,
; 112, each load waited on before its store, followed by s_endpgm.
; NOTE(review): attribute group #0 is defined outside this excerpt; judging
; by the function name it is presumably minsize -- confirm.
define amdgpu_kernel void @memcpy_p1_p4_minsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #0 {
|
|
; CHECK-LABEL: memcpy_p1_p4_minsize:
|
|
; CHECK: ; %bb.0: ; %entry
|
|
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; CHECK-NEXT: v_mov_b32_e32 v4, 0
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:32
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:48
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:64
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:80
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:96
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:112
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
|
|
; CHECK-NEXT: s_endpgm
|
|
entry:
|
|
tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 {
|
|
; CHECK-LABEL: memcpy_p5_p4_minsize:
|
|
; CHECK: ; %bb.0: ; %entry
|
|
; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
|
|
; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
|
|
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
|
; CHECK-NEXT: s_add_u32 s8, s8, s7
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1]
|
|
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1
|
|
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2
|
|
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:3
|
|
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:4
|
|
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5
|
|
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6
|
|
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:7
|
|
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:8
|
|
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:9
|
|
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10
|
|
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:11
|
|
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:12
|
|
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:13
|
|
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:14
|
|
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:15
|
|
; CHECK-NEXT: s_addc_u32 s9, s9, 0
|
|
; CHECK-NEXT: v_mov_b32_e32 v1, s2
|
|
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:16
|
|
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:17
|
|
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:18
|
|
; CHECK-NEXT: s_waitcnt vmcnt(18)
|
|
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen
|
|
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:19
|
|
; CHECK-NEXT: s_waitcnt vmcnt(19)
|
|
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:1
|
|
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:20
|
|
; CHECK-NEXT: s_waitcnt vmcnt(20)
|
|
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:2
|
|
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:21
|
|
; CHECK-NEXT: s_waitcnt vmcnt(21)
|
|
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:3
|
|
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:22
|
|
; CHECK-NEXT: s_waitcnt vmcnt(22)
|
|
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:4
|
|
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:23
|
|
; CHECK-NEXT: s_waitcnt vmcnt(23)
|
|
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:5
|
|
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:24
|
|
; CHECK-NEXT: s_waitcnt vmcnt(24)
|
|
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6
|
|
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25
|
|
; CHECK-NEXT: s_waitcnt vmcnt(25)
|
|
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:7
|
|
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:26
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:8
|
|
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:27
|
|
; CHECK-NEXT: s_waitcnt vmcnt(27)
|
|
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:9
|
|
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:28
|
|
; CHECK-NEXT: s_waitcnt vmcnt(28)
|
|
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:10
|
|
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:29
|
|
; CHECK-NEXT: s_waitcnt vmcnt(29)
|
|
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:11
|
|
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:30
|
|
; CHECK-NEXT: s_waitcnt vmcnt(30)
|
|
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:12
|
|
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:31
|
|
; CHECK-NEXT: s_waitcnt vmcnt(31)
|
|
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:13
|
|
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:32
|
|
; CHECK-NEXT: s_waitcnt vmcnt(32)
|
|
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:14
|
|
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:33
|
|
; CHECK-NEXT: s_waitcnt vmcnt(33)
|
|
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:15
|
|
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:34
|
|
; CHECK-NEXT: s_waitcnt vmcnt(34)
|
|
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:16
|
|
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:35
|
|
; CHECK-NEXT: s_waitcnt vmcnt(35)
|
|
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:17
|
|
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:36
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:18
|
|
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:37
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:19
|
|
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:20
|
|
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:39
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:21
|
|
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:40
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:22
|
|
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:41
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:23
|
|
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:42
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:24
|
|
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:43
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25
|
|
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:44
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:26
|
|
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:45
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:27
|
|
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:46
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:28
|
|
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:47
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:29
|
|
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:48
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:30
|
|
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:49
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:31
|
|
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:50
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:32
|
|
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:51
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:33
|
|
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:52
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:34
|
|
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:53
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:35
|
|
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:54
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:36
|
|
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:55
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:37
|
|
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:56
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38
|
|
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:39
|
|
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:58
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:40
|
|
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:59
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:41
|
|
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:60
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:42
|
|
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:61
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:43
|
|
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:62
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:44
|
|
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:63
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:45
|
|
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:64
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:46
|
|
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:65
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:47
|
|
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:66
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:48
|
|
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:67
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:49
|
|
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:68
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:50
|
|
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:69
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:51
|
|
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:70
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:52
|
|
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:71
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:53
|
|
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:72
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:54
|
|
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:73
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:55
|
|
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:74
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:56
|
|
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:75
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57
|
|
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:76
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:58
|
|
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:77
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:59
|
|
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:78
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:60
|
|
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:79
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:61
|
|
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:80
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:62
|
|
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:81
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:63
|
|
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:82
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:64
|
|
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:83
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:65
|
|
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:84
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:66
|
|
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:85
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:67
|
|
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:86
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:68
|
|
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:87
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:69
|
|
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:88
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:70
|
|
; CHECK-NEXT: s_waitcnt vmcnt(35)
|
|
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:71
|
|
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:90
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:72
|
|
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:91
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:73
|
|
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:92
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:74
|
|
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:93
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:75
|
|
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:94
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:76
|
|
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:95
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:77
|
|
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:96
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:78
|
|
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:97
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:79
|
|
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:98
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:80
|
|
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:99
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:81
|
|
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:100
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:82
|
|
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:101
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:83
|
|
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:102
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:84
|
|
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:103
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:85
|
|
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:104
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:86
|
|
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:105
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:87
|
|
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:106
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:88
|
|
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:107
|
|
; CHECK-NEXT: s_waitcnt vmcnt(35)
|
|
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89
|
|
; CHECK-NEXT: s_waitcnt vmcnt(35)
|
|
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:90
|
|
; CHECK-NEXT: s_waitcnt vmcnt(34)
|
|
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:91
|
|
; CHECK-NEXT: s_waitcnt vmcnt(33)
|
|
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:92
|
|
; CHECK-NEXT: s_waitcnt vmcnt(32)
|
|
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:93
|
|
; CHECK-NEXT: s_waitcnt vmcnt(31)
|
|
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:94
|
|
; CHECK-NEXT: s_waitcnt vmcnt(30)
|
|
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:95
|
|
; CHECK-NEXT: s_waitcnt vmcnt(29)
|
|
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:96
|
|
; CHECK-NEXT: s_waitcnt vmcnt(28)
|
|
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:97
|
|
; CHECK-NEXT: s_waitcnt vmcnt(27)
|
|
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:98
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:99
|
|
; CHECK-NEXT: s_waitcnt vmcnt(25)
|
|
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:100
|
|
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:108
|
|
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:109
|
|
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110
|
|
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:111
|
|
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:112
|
|
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:113
|
|
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:114
|
|
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:115
|
|
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:116
|
|
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:117
|
|
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:118
|
|
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:119
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:101
|
|
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:120
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:102
|
|
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:103
|
|
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:122
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:104
|
|
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:123
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:105
|
|
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:124
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:106
|
|
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:125
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:107
|
|
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:126
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:127
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:108
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:109
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:111
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:112
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:113
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:114
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:115
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:116
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:117
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:118
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:119
|
|
; CHECK-NEXT: s_waitcnt vmcnt(25)
|
|
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:120
|
|
; CHECK-NEXT: s_waitcnt vmcnt(24)
|
|
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121
|
|
; CHECK-NEXT: s_waitcnt vmcnt(23)
|
|
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:122
|
|
; CHECK-NEXT: s_waitcnt vmcnt(22)
|
|
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:123
|
|
; CHECK-NEXT: s_waitcnt vmcnt(21)
|
|
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:124
|
|
; CHECK-NEXT: s_waitcnt vmcnt(20)
|
|
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:125
|
|
; CHECK-NEXT: s_waitcnt vmcnt(19)
|
|
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:126
|
|
; CHECK-NEXT: s_waitcnt vmcnt(19)
|
|
; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:127
|
|
; CHECK-NEXT: s_endpgm
|
|
entry:
|
|
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
|
|
ret void
|
|
}
|
|
|
|
; memcpy_p0_p5_minsize: copies a fixed 128 bytes from an addrspace(5) pointer
; (%src) to a generic pointer (%generic) with @llvm.memcpy.p0.p5.i64.
; NOTE(review): addrspace(5) is taken to be AMDGPU private/scratch memory and
; attribute group #0 is presumably the minsize group; both are defined outside
; this part of the file -- confirm.
; The autogenerated assertions below expect the copy to be expanded inline:
; one buffer_load_ubyte and one flat_store_byte per byte for offsets 0..127,
; followed directly by s_endpgm, with no call instruction in between.
define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 {
|
|
; CHECK-LABEL: memcpy_p0_p5_minsize:
|
|
; CHECK: ; %bb.0: ; %entry
|
|
; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
|
|
; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
|
|
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8
|
|
; CHECK-NEXT: s_add_u32 s8, s8, s7
|
|
; CHECK-NEXT: s_addc_u32 s9, s9, 0
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: v_mov_b32_e32 v2, s0
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen
|
|
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1
|
|
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2
|
|
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3
|
|
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4
|
|
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5
|
|
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6
|
|
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7
|
|
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8
|
|
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9
|
|
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10
|
|
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11
|
|
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12
|
|
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13
|
|
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14
|
|
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15
|
|
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16
|
|
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, s0
|
|
; CHECK-NEXT: v_mov_b32_e32 v1, s1
|
|
; CHECK-NEXT: s_waitcnt vmcnt(17)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
|
|
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
|
|
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
|
|
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
|
|
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
|
|
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
|
|
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
|
|
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
|
|
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9
|
|
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
|
|
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
|
|
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
|
|
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13
|
|
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14
|
|
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
|
|
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
|
|
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
|
|
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:18
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:19
|
|
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20
|
|
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:21
|
|
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22
|
|
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:23
|
|
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24
|
|
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25
|
|
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:26
|
|
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:27
|
|
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:28
|
|
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:29
|
|
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:30
|
|
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:31
|
|
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:32
|
|
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:33
|
|
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:34
|
|
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:35
|
|
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:36
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:37
|
|
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:38
|
|
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
|
|
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:40
|
|
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:41
|
|
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:42
|
|
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:43
|
|
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:61
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:44
|
|
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:45
|
|
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:46
|
|
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:47
|
|
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:48
|
|
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:49
|
|
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:50
|
|
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51
|
|
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:52
|
|
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:53
|
|
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:54
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
|
|
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
|
|
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:57
|
|
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:58
|
|
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:59
|
|
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:60
|
|
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:61
|
|
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:62
|
|
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:63
|
|
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:64
|
|
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:65
|
|
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:66
|
|
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:67
|
|
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:68
|
|
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:69
|
|
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:70
|
|
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:71
|
|
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74
|
|
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75
|
|
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76
|
|
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77
|
|
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78
|
|
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79
|
|
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:80
|
|
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:81
|
|
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:82
|
|
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:83
|
|
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:84
|
|
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:85
|
|
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:86
|
|
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:87
|
|
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:88
|
|
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:89
|
|
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:91
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:92
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:93
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:94
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:95
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:96
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:97
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:98
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:99
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:100
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:101
|
|
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109
|
|
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110
|
|
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111
|
|
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112
|
|
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113
|
|
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114
|
|
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115
|
|
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116
|
|
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117
|
|
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118
|
|
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:102
|
|
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:103
|
|
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:104
|
|
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:105
|
|
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:106
|
|
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:107
|
|
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:108
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:109
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:110
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:111
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:112
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:113
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:114
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:115
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:116
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:117
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:118
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:119
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:120
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:121
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:122
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:123
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:124
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:125
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:126
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:127
|
|
; CHECK-NEXT: s_endpgm
|
|
entry:
|
|
; Constant length 128; the final i1 false marks the copy as non-volatile.
tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
|
|
ret void
|
|
}
|
|
|
|
; memcpy_p3_p4_minsize: copies a fixed 128 bytes from an addrspace(4) pointer
; (%0) into the addrspace(3) global @shared with @llvm.memcpy.p3.p4.i64.
; NOTE(review): addrspace(4) / addrspace(3) are taken to be AMDGPU constant
; and LDS memory, and attribute group #0 is presumably the minsize group;
; #0 is defined outside this part of the file -- confirm.
; The autogenerated assertions below expect the copy to be expanded inline as
; eight global_load_dwordx4 loads and eight ds_write2_b64 stores, followed
; directly by s_endpgm, with no call instruction in between.
define amdgpu_kernel void @memcpy_p3_p4_minsize(ptr addrspace(4) %0) #0 {
|
|
; CHECK-LABEL: memcpy_p3_p4_minsize:
|
|
; CHECK: ; %bb.0: ; %entry
|
|
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; CHECK-NEXT: v_mov_b32_e32 v24, 0
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
|
|
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
|
|
; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
|
|
; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
|
|
; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
|
|
; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
|
|
; CHECK-NEXT: s_waitcnt vmcnt(5)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15
|
|
; CHECK-NEXT: s_waitcnt vmcnt(4)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
|
|
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(5)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11
|
|
; CHECK-NEXT: s_waitcnt vmcnt(4)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9
|
|
; CHECK-NEXT: s_waitcnt vmcnt(3)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7
|
|
; CHECK-NEXT: s_waitcnt vmcnt(2)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5
|
|
; CHECK-NEXT: s_waitcnt vmcnt(1)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset1:1
|
|
; CHECK-NEXT: s_endpgm
|
|
entry:
|
|
; Constant length 128; the final i1 false marks the copy as non-volatile.
tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
|
|
; CHECK-LABEL: memcpy_p0_p3_minsize:
|
|
; CHECK: ; %bb.0: ; %entry
|
|
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; CHECK-NEXT: v_mov_b32_e32 v2, 0
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:127
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:126
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:125
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:124
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, s0
|
|
; CHECK-NEXT: v_mov_b32_e32 v1, s1
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:127
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:126
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:123
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:125
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:124
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:122
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:121
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:123
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:120
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:119
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:122
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:121
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:118
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:119
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:117
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:116
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:118
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:115
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:114
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:117
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:116
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:113
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:115
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:112
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:111
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:110
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:109
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:111
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:108
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:110
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:109
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:107
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:106
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:108
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:105
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:104
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:107
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:103
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:104
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:102
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:101
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:103
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:100
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:99
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:102
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:101
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:98
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:100
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:97
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:96
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:98
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:95
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:94
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:96
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:93
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:95
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:94
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:92
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:91
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:93
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:90
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:89
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:92
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:91
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:88
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:89
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:87
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:86
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:88
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:85
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:84
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:86
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:83
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:85
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:82
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:81
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:83
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:80
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:79
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:82
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:81
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:78
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:79
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:77
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:76
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:78
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:75
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:74
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:77
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:76
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:73
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:75
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:74
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:72
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:71
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:73
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:70
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:69
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:72
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:71
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:68
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:70
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:69
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:67
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:66
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:65
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:64
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:67
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:63
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:64
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:62
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:61
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:63
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:60
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:59
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:62
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:61
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:58
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:60
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:57
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:56
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:55
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:54
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:53
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:52
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:51
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:53
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:50
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:49
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:52
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:51
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:48
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:50
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:49
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:47
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:46
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:48
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:45
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:44
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:47
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:46
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:43
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:45
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:44
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:42
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:41
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:40
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:39
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:38
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:37
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:36
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:38
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:35
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:34
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:37
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:36
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:33
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:35
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:34
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:32
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:31
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:33
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:30
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:29
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:32
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:31
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:28
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:27
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:26
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:25
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:24
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:27
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:23
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:24
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:22
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:21
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:23
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:20
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:19
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:21
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:18
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:19
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:16
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:17
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:18
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:8
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:9
|
|
; CHECK-NEXT: ds_read_u8 v7, v2 offset:10
|
|
; CHECK-NEXT: ds_read_u8 v8, v2 offset:11
|
|
; CHECK-NEXT: ds_read_u8 v9, v2 offset:12
|
|
; CHECK-NEXT: ds_read_u8 v10, v2 offset:13
|
|
; CHECK-NEXT: ds_read_u8 v11, v2 offset:14
|
|
; CHECK-NEXT: ds_read_u8 v12, v2 offset:15
|
|
; CHECK-NEXT: ds_read_u8 v13, v2
|
|
; CHECK-NEXT: ds_read_u8 v14, v2 offset:1
|
|
; CHECK-NEXT: ds_read_u8 v15, v2 offset:2
|
|
; CHECK-NEXT: ds_read_u8 v16, v2 offset:3
|
|
; CHECK-NEXT: ds_read_u8 v17, v2 offset:4
|
|
; CHECK-NEXT: ds_read_u8 v18, v2 offset:5
|
|
; CHECK-NEXT: ds_read_u8 v19, v2 offset:6
|
|
; CHECK-NEXT: ds_read_u8 v2, v2 offset:7
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:17
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:15
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:14
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:13
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:12
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:9
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:6
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:5
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:4
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:1
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13
|
|
; CHECK-NEXT: s_endpgm
|
|
entry:
|
|
tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
|
|
ret void
|
|
}
|
|
|
|
; Test: fixed-size memcpy of 47 bytes between two plain `ptr` (address space 0)
; kernel arguments, in a kernel that carries attribute group #1.
; NOTE(review): #1 is defined outside this excerpt; presumably it is the
; optsize attribute group, as the function name suggests - confirm.
;
; The expected output contains no call: the copy is expanded inline as 47
; flat_load_ubyte / flat_store_byte pairs covering offsets 0 through 46, each
; store preceded by an s_waitcnt on the load it consumes.
;
; The CHECK lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
|
|
; CHECK-LABEL: memcpy_p0_p0_optsize:
|
|
; CHECK: ; %bb.0: ; %entry
|
|
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, s2
|
|
; CHECK-NEXT: v_mov_b32_e32 v1, s3
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1]
|
|
; CHECK-NEXT: v_mov_b32_e32 v3, s1
|
|
; CHECK-NEXT: v_mov_b32_e32 v2, s0
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:1
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:1
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:2
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:2
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:3
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:3
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:4
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:4
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:5
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:5
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:6
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:6
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:7
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:7
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:8
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:8
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:9
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:9
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:10
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:10
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:11
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:11
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:12
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:12
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:13
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:13
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:14
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:14
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:15
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:15
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:16
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:16
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:17
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:17
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:18
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:18
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:19
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:19
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:20
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:20
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:21
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:21
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:22
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:22
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:23
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:23
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:24
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:24
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:25
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:25
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:26
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:26
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:27
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:27
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:28
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:28
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:29
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:29
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:30
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:30
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:31
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:31
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:32
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:32
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:33
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:33
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:34
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:34
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:35
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:35
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:36
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:36
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:37
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:37
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:38
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:38
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:39
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:39
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:40
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:40
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:41
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:41
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:42
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:42
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:43
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:43
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:44
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:44
|
|
; CHECK-NEXT: flat_load_ubyte v4, v[0:1] offset:45
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v4 offset:45
|
|
; CHECK-NEXT: flat_load_ubyte v0, v[0:1] offset:46
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[2:3], v0 offset:46
|
|
; CHECK-NEXT: s_endpgm
|
|
entry:
|
|
; Constant length 47, non-volatile (i1 false), no align attributes on either pointer.
tail call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 47, i1 false)
|
|
ret void
|
|
}
|
|
|
|
; Test: fixed-size memcpy of 47 bytes between two addrspace(1) kernel
; arguments, in a kernel that carries attribute group #1.
; NOTE(review): #1 is defined outside this excerpt; presumably it is the
; optsize attribute group, as the function name suggests - confirm.
;
; The expected output contains no call. The 47 bytes are covered by four
; load/store pairs: dwordx4 at offsets 0 and 16 (bytes 0-31), dwordx2 at
; offset 32 (bytes 32-39) and a second dwordx2 at offset 39 (bytes 39-46),
; which overlaps the previous one by a single byte.
;
; The CHECK lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
define amdgpu_kernel void @memcpy_p1_p1_optsize(ptr addrspace(1) %dest, ptr addrspace(1) %src) #1 {
|
|
; CHECK-LABEL: memcpy_p1_p1_optsize:
|
|
; CHECK: ; %bb.0: ; %entry
|
|
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; CHECK-NEXT: v_mov_b32_e32 v4, 0
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:32
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:32
|
|
; CHECK-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] offset:39
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] offset:39
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
|
|
; CHECK-NEXT: s_endpgm
|
|
entry:
|
|
; Constant length 47, non-volatile (i1 false), no align attributes on either pointer.
tail call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dest, ptr addrspace(1) %src, i64 47, i1 false)
|
|
ret void
|
|
}
|
|
|
|
; Test: fixed-size memcpy of 128 bytes from an addrspace(4) kernel argument
; to an addrspace(1) kernel argument, in a kernel that carries attribute
; group #1.
; NOTE(review): #1 is defined outside this excerpt; presumably it is the
; optsize attribute group, as the function name suggests - confirm.
;
; The expected output contains no call: the copy is expanded inline as eight
; global_load_dwordx4 / global_store_dwordx4 pairs at offsets 0, 16, ..., 112,
; each store preceded by an s_waitcnt vmcnt(0) on its load.
;
; The CHECK lines are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
define amdgpu_kernel void @memcpy_p1_p4_optsize(ptr addrspace(1) %global, ptr addrspace(4) %0) #1 {
|
|
; CHECK-LABEL: memcpy_p1_p4_optsize:
|
|
; CHECK: ; %bb.0: ; %entry
|
|
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
|
|
; CHECK-NEXT: v_mov_b32_e32 v4, 0
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:16
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:32
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:48
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:64
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:64
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:80
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:80
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:96
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:96
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3] offset:112
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:112
|
|
; CHECK-NEXT: s_endpgm
|
|
entry:
|
|
; Constant length 128, non-volatile (i1 false), no align attributes on either pointer.
tail call void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) %global, ptr addrspace(4) %0, i64 128, i1 false)
|
|
ret void
|
|
}
|
|
|
|
define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 {
|
|
; CHECK-LABEL: memcpy_p5_p4_optsize:
|
|
; CHECK: ; %bb.0: ; %entry
|
|
; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
|
|
; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
|
|
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
|
|
; CHECK-NEXT: s_load_dword s2, s[4:5], 0x0
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, 0
|
|
; CHECK-NEXT: s_add_u32 s8, s8, s7
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1]
|
|
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1
|
|
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:2
|
|
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:3
|
|
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:4
|
|
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5
|
|
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:6
|
|
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:7
|
|
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:8
|
|
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:9
|
|
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:10
|
|
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:11
|
|
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:12
|
|
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:13
|
|
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:14
|
|
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:15
|
|
; CHECK-NEXT: s_addc_u32 s9, s9, 0
|
|
; CHECK-NEXT: v_mov_b32_e32 v1, s2
|
|
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:16
|
|
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:17
|
|
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:18
|
|
; CHECK-NEXT: s_waitcnt vmcnt(18)
|
|
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen
|
|
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:19
|
|
; CHECK-NEXT: s_waitcnt vmcnt(19)
|
|
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:1
|
|
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:20
|
|
; CHECK-NEXT: s_waitcnt vmcnt(20)
|
|
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:2
|
|
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:21
|
|
; CHECK-NEXT: s_waitcnt vmcnt(21)
|
|
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:3
|
|
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:22
|
|
; CHECK-NEXT: s_waitcnt vmcnt(22)
|
|
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:4
|
|
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:23
|
|
; CHECK-NEXT: s_waitcnt vmcnt(23)
|
|
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:5
|
|
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:24
|
|
; CHECK-NEXT: s_waitcnt vmcnt(24)
|
|
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:6
|
|
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:25
|
|
; CHECK-NEXT: s_waitcnt vmcnt(25)
|
|
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:7
|
|
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:26
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:8
|
|
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:27
|
|
; CHECK-NEXT: s_waitcnt vmcnt(27)
|
|
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:9
|
|
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:28
|
|
; CHECK-NEXT: s_waitcnt vmcnt(28)
|
|
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:10
|
|
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:29
|
|
; CHECK-NEXT: s_waitcnt vmcnt(29)
|
|
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:11
|
|
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:30
|
|
; CHECK-NEXT: s_waitcnt vmcnt(30)
|
|
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:12
|
|
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:31
|
|
; CHECK-NEXT: s_waitcnt vmcnt(31)
|
|
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:13
|
|
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:32
|
|
; CHECK-NEXT: s_waitcnt vmcnt(32)
|
|
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:14
|
|
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:33
|
|
; CHECK-NEXT: s_waitcnt vmcnt(33)
|
|
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:15
|
|
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:34
|
|
; CHECK-NEXT: s_waitcnt vmcnt(34)
|
|
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:16
|
|
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:35
|
|
; CHECK-NEXT: s_waitcnt vmcnt(35)
|
|
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:17
|
|
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:36
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:18
|
|
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:37
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:19
|
|
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:38
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:20
|
|
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:39
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:21
|
|
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:40
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:22
|
|
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:41
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:23
|
|
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:42
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:24
|
|
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:43
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:25
|
|
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:44
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:26
|
|
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:45
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:27
|
|
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:46
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:28
|
|
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:47
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:29
|
|
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:48
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:30
|
|
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:49
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:31
|
|
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:50
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:32
|
|
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:51
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:33
|
|
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:52
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:34
|
|
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:53
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:35
|
|
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:54
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:36
|
|
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:55
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:37
|
|
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:56
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:38
|
|
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:57
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:39
|
|
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:58
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:40
|
|
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:59
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:41
|
|
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:60
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:42
|
|
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:61
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:43
|
|
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:62
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:44
|
|
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:63
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:45
|
|
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:64
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:46
|
|
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:65
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:47
|
|
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:66
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:48
|
|
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:67
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:49
|
|
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:68
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:50
|
|
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:69
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:51
|
|
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:70
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:52
|
|
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:71
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:53
|
|
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:72
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:54
|
|
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:73
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:55
|
|
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:74
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:56
|
|
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:75
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:57
|
|
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:76
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:58
|
|
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:77
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:59
|
|
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:78
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:60
|
|
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:79
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:61
|
|
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:80
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:62
|
|
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:81
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:63
|
|
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:82
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:64
|
|
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:83
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:65
|
|
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:84
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:66
|
|
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:85
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:67
|
|
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:86
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:68
|
|
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:87
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:69
|
|
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:88
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:70
|
|
; CHECK-NEXT: s_waitcnt vmcnt(35)
|
|
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:71
|
|
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:89
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:90
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:72
|
|
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:91
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:73
|
|
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:92
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:74
|
|
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:93
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:75
|
|
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:94
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:76
|
|
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:95
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:77
|
|
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:96
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:78
|
|
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:97
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:79
|
|
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:98
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:80
|
|
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:99
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:81
|
|
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:100
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:82
|
|
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:101
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:83
|
|
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:102
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:84
|
|
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:103
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:85
|
|
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:104
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:86
|
|
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:105
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:87
|
|
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:106
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:88
|
|
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:107
|
|
; CHECK-NEXT: s_waitcnt vmcnt(35)
|
|
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:89
|
|
; CHECK-NEXT: s_waitcnt vmcnt(35)
|
|
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:90
|
|
; CHECK-NEXT: s_waitcnt vmcnt(34)
|
|
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:91
|
|
; CHECK-NEXT: s_waitcnt vmcnt(33)
|
|
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:92
|
|
; CHECK-NEXT: s_waitcnt vmcnt(32)
|
|
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:93
|
|
; CHECK-NEXT: s_waitcnt vmcnt(31)
|
|
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:94
|
|
; CHECK-NEXT: s_waitcnt vmcnt(30)
|
|
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:95
|
|
; CHECK-NEXT: s_waitcnt vmcnt(29)
|
|
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:96
|
|
; CHECK-NEXT: s_waitcnt vmcnt(28)
|
|
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:97
|
|
; CHECK-NEXT: s_waitcnt vmcnt(27)
|
|
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:98
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:99
|
|
; CHECK-NEXT: s_waitcnt vmcnt(25)
|
|
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:100
|
|
; CHECK-NEXT: global_load_ubyte v2, v0, s[0:1] offset:108
|
|
; CHECK-NEXT: global_load_ubyte v3, v0, s[0:1] offset:109
|
|
; CHECK-NEXT: global_load_ubyte v4, v0, s[0:1] offset:110
|
|
; CHECK-NEXT: global_load_ubyte v5, v0, s[0:1] offset:111
|
|
; CHECK-NEXT: global_load_ubyte v6, v0, s[0:1] offset:112
|
|
; CHECK-NEXT: global_load_ubyte v7, v0, s[0:1] offset:113
|
|
; CHECK-NEXT: global_load_ubyte v15, v0, s[0:1] offset:114
|
|
; CHECK-NEXT: global_load_ubyte v16, v0, s[0:1] offset:115
|
|
; CHECK-NEXT: global_load_ubyte v17, v0, s[0:1] offset:116
|
|
; CHECK-NEXT: global_load_ubyte v18, v0, s[0:1] offset:117
|
|
; CHECK-NEXT: global_load_ubyte v19, v0, s[0:1] offset:118
|
|
; CHECK-NEXT: global_load_ubyte v20, v0, s[0:1] offset:119
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:101
|
|
; CHECK-NEXT: global_load_ubyte v8, v0, s[0:1] offset:120
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:102
|
|
; CHECK-NEXT: global_load_ubyte v9, v0, s[0:1] offset:121
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:103
|
|
; CHECK-NEXT: global_load_ubyte v10, v0, s[0:1] offset:122
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:104
|
|
; CHECK-NEXT: global_load_ubyte v11, v0, s[0:1] offset:123
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:105
|
|
; CHECK-NEXT: global_load_ubyte v12, v0, s[0:1] offset:124
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:106
|
|
; CHECK-NEXT: global_load_ubyte v13, v0, s[0:1] offset:125
|
|
; CHECK-NEXT: s_waitcnt vmcnt(36)
|
|
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:107
|
|
; CHECK-NEXT: global_load_ubyte v14, v0, s[0:1] offset:126
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: global_load_ubyte v21, v0, s[0:1] offset:127
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v2, v1, s[8:11], 0 offen offset:108
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v3, v1, s[8:11], 0 offen offset:109
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v4, v1, s[8:11], 0 offen offset:110
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v5, v1, s[8:11], 0 offen offset:111
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v6, v1, s[8:11], 0 offen offset:112
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v7, v1, s[8:11], 0 offen offset:113
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v15, v1, s[8:11], 0 offen offset:114
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v16, v1, s[8:11], 0 offen offset:115
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v17, v1, s[8:11], 0 offen offset:116
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v18, v1, s[8:11], 0 offen offset:117
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v19, v1, s[8:11], 0 offen offset:118
|
|
; CHECK-NEXT: s_waitcnt vmcnt(26)
|
|
; CHECK-NEXT: buffer_store_byte v20, v1, s[8:11], 0 offen offset:119
|
|
; CHECK-NEXT: s_waitcnt vmcnt(25)
|
|
; CHECK-NEXT: buffer_store_byte v8, v1, s[8:11], 0 offen offset:120
|
|
; CHECK-NEXT: s_waitcnt vmcnt(24)
|
|
; CHECK-NEXT: buffer_store_byte v9, v1, s[8:11], 0 offen offset:121
|
|
; CHECK-NEXT: s_waitcnt vmcnt(23)
|
|
; CHECK-NEXT: buffer_store_byte v10, v1, s[8:11], 0 offen offset:122
|
|
; CHECK-NEXT: s_waitcnt vmcnt(22)
|
|
; CHECK-NEXT: buffer_store_byte v11, v1, s[8:11], 0 offen offset:123
|
|
; CHECK-NEXT: s_waitcnt vmcnt(21)
|
|
; CHECK-NEXT: buffer_store_byte v12, v1, s[8:11], 0 offen offset:124
|
|
; CHECK-NEXT: s_waitcnt vmcnt(20)
|
|
; CHECK-NEXT: buffer_store_byte v13, v1, s[8:11], 0 offen offset:125
|
|
; CHECK-NEXT: s_waitcnt vmcnt(19)
|
|
; CHECK-NEXT: buffer_store_byte v14, v1, s[8:11], 0 offen offset:126
|
|
; CHECK-NEXT: s_waitcnt vmcnt(19)
|
|
; CHECK-NEXT: buffer_store_byte v21, v1, s[8:11], 0 offen offset:127
|
|
; CHECK-NEXT: s_endpgm
|
|
entry:
|
|
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
|
|
ret void
|
|
}
|
|
|
|
; Test: 128-byte llvm.memcpy from an addrspace(5) pointer (%src) to a generic
; pointer (%generic), with isvolatile = false.
; The expected gfx908 assembly below performs the copy inline, one byte at a
; time: every offset 0..127 is read with buffer_load_ubyte and written with
; flat_store_byte, and the kernel ends with s_endpgm without any call.
; NOTE(review): attribute group #1 is defined outside this excerpt; the
; function name suggests it carries optsize -- confirm.
; NOTE(review): neither pointer argument has an align attribute, which is
; presumably why the expansion is byte-wise -- confirm.
; The assertion lines are generated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 {
|
|
; CHECK-LABEL: memcpy_p0_p5_optsize:
|
|
; CHECK: ; %bb.0: ; %entry
|
|
; CHECK-NEXT: s_mov_b64 s[10:11], s[2:3]
|
|
; CHECK-NEXT: s_mov_b64 s[8:9], s[0:1]
|
|
; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8
|
|
; CHECK-NEXT: s_add_u32 s8, s8, s7
|
|
; CHECK-NEXT: s_addc_u32 s9, s9, 0
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: v_mov_b32_e32 v2, s0
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen
|
|
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:1
|
|
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:2
|
|
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:3
|
|
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:4
|
|
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:5
|
|
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:6
|
|
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:7
|
|
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:8
|
|
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:9
|
|
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:10
|
|
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:11
|
|
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:12
|
|
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:13
|
|
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:14
|
|
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:15
|
|
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:16
|
|
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:17
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, s0
|
|
; CHECK-NEXT: v_mov_b32_e32 v1, s1
|
|
; CHECK-NEXT: s_waitcnt vmcnt(17)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:18
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:1
|
|
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:19
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:2
|
|
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:20
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:3
|
|
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:21
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:4
|
|
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:22
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:5
|
|
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:23
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:6
|
|
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:24
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:7
|
|
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:25
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:8
|
|
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:26
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:9
|
|
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:27
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:10
|
|
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:28
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:11
|
|
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:29
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:12
|
|
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:30
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:13
|
|
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:31
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:14
|
|
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:32
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:15
|
|
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:33
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:16
|
|
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:34
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:17
|
|
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:35
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:18
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:36
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:19
|
|
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:37
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:20
|
|
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:38
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:21
|
|
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:39
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:22
|
|
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:40
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:23
|
|
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:41
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:24
|
|
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:42
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:25
|
|
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:43
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:26
|
|
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:44
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:27
|
|
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:45
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:28
|
|
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:46
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:29
|
|
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:47
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:30
|
|
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:48
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:31
|
|
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:49
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:32
|
|
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:50
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:33
|
|
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:51
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:34
|
|
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:52
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:35
|
|
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:53
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:36
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:54
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:37
|
|
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:55
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:38
|
|
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:56
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
|
|
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:57
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:40
|
|
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:58
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:41
|
|
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:59
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:42
|
|
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:60
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:43
|
|
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:61
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:44
|
|
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:62
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:45
|
|
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:63
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:46
|
|
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:64
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:47
|
|
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:65
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:48
|
|
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:66
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:49
|
|
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:67
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:50
|
|
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:68
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:51
|
|
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:69
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:52
|
|
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:70
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:53
|
|
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:71
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:54
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:72
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
|
|
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:73
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
|
|
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:74
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:57
|
|
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:75
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:58
|
|
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:76
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:59
|
|
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:77
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:60
|
|
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:78
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:61
|
|
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:79
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:62
|
|
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:80
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:63
|
|
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:81
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:64
|
|
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:82
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:65
|
|
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:83
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:66
|
|
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:84
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:67
|
|
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:85
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:68
|
|
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:86
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:69
|
|
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:87
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:70
|
|
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:88
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:71
|
|
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:89
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:72
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:90
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:73
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:74
|
|
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:91
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:92
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:75
|
|
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:93
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:76
|
|
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:94
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:77
|
|
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:95
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:78
|
|
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:96
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:79
|
|
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:97
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:80
|
|
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:98
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:81
|
|
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:99
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:82
|
|
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:100
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:83
|
|
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:101
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:84
|
|
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:102
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:85
|
|
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:103
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:86
|
|
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:104
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:87
|
|
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:105
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:88
|
|
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:106
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:89
|
|
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:107
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:108
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:91
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:92
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:93
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:94
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:95
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:96
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:97
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:98
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:99
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:100
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:101
|
|
; CHECK-NEXT: buffer_load_ubyte v4, v2, s[8:11], 0 offen offset:109
|
|
; CHECK-NEXT: buffer_load_ubyte v5, v2, s[8:11], 0 offen offset:110
|
|
; CHECK-NEXT: buffer_load_ubyte v6, v2, s[8:11], 0 offen offset:111
|
|
; CHECK-NEXT: buffer_load_ubyte v7, v2, s[8:11], 0 offen offset:112
|
|
; CHECK-NEXT: buffer_load_ubyte v8, v2, s[8:11], 0 offen offset:113
|
|
; CHECK-NEXT: buffer_load_ubyte v9, v2, s[8:11], 0 offen offset:114
|
|
; CHECK-NEXT: buffer_load_ubyte v10, v2, s[8:11], 0 offen offset:115
|
|
; CHECK-NEXT: buffer_load_ubyte v11, v2, s[8:11], 0 offen offset:116
|
|
; CHECK-NEXT: buffer_load_ubyte v12, v2, s[8:11], 0 offen offset:117
|
|
; CHECK-NEXT: buffer_load_ubyte v13, v2, s[8:11], 0 offen offset:118
|
|
; CHECK-NEXT: buffer_load_ubyte v14, v2, s[8:11], 0 offen offset:119
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:102
|
|
; CHECK-NEXT: buffer_load_ubyte v15, v2, s[8:11], 0 offen offset:120
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:103
|
|
; CHECK-NEXT: buffer_load_ubyte v16, v2, s[8:11], 0 offen offset:121
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:104
|
|
; CHECK-NEXT: buffer_load_ubyte v17, v2, s[8:11], 0 offen offset:122
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:105
|
|
; CHECK-NEXT: buffer_load_ubyte v18, v2, s[8:11], 0 offen offset:123
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:106
|
|
; CHECK-NEXT: buffer_load_ubyte v19, v2, s[8:11], 0 offen offset:124
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:107
|
|
; CHECK-NEXT: buffer_load_ubyte v20, v2, s[8:11], 0 offen offset:125
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:108
|
|
; CHECK-NEXT: buffer_load_ubyte v3, v2, s[8:11], 0 offen offset:126
|
|
; CHECK-NEXT: s_nop 0
|
|
; CHECK-NEXT: buffer_load_ubyte v21, v2, s[8:11], 0 offen offset:127
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:109
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:110
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:111
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:112
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:113
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:114
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:115
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:116
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:117
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13 offset:118
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:119
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:120
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:121
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:122
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:123
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:124
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v20 offset:125
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:126
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v21 offset:127
|
|
; CHECK-NEXT: s_endpgm
|
|
entry:
|
|
; Constant length 128; the trailing i1 false is the isvolatile flag.
tail call void @llvm.memcpy.p0.p5.i64(ptr %generic, ptr addrspace(5) %src, i64 128, i1 false)
|
|
ret void
|
|
}
|
|
|
|
; Test: 128-byte llvm.memcpy from an addrspace(4) pointer argument (%0) into
; the addrspace(3) global @shared, with isvolatile = false.
; The expected gfx908 assembly below performs the copy inline with eight
; global_load_dwordx4 (offsets 112 down to 0) and eight ds_write2_b64, and
; the kernel ends with s_endpgm without any call.
; NOTE(review): attribute group #1 is defined outside this excerpt; the
; function name suggests it carries optsize -- confirm.
; The assertion lines are generated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
define amdgpu_kernel void @memcpy_p3_p4_optsize(ptr addrspace(4) %0) #1 {
|
|
; CHECK-LABEL: memcpy_p3_p4_optsize:
|
|
; CHECK: ; %bb.0: ; %entry
|
|
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; CHECK-NEXT: v_mov_b32_e32 v24, 0
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
|
|
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
|
|
; CHECK-NEXT: global_load_dwordx4 v[8:11], v24, s[0:1] offset:80
|
|
; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
|
|
; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
|
|
; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
|
|
; CHECK-NEXT: s_waitcnt vmcnt(5)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:14 offset1:15
|
|
; CHECK-NEXT: s_waitcnt vmcnt(4)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset0:12 offset1:13
|
|
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
|
|
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
|
|
; CHECK-NEXT: s_waitcnt vmcnt(5)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[8:9], v[10:11] offset0:10 offset1:11
|
|
; CHECK-NEXT: s_waitcnt vmcnt(4)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[12:13], v[14:15] offset0:8 offset1:9
|
|
; CHECK-NEXT: s_waitcnt vmcnt(3)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[16:17], v[18:19] offset0:6 offset1:7
|
|
; CHECK-NEXT: s_waitcnt vmcnt(2)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[20:21], v[22:23] offset0:4 offset1:5
|
|
; CHECK-NEXT: s_waitcnt vmcnt(1)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[0:1], v[2:3] offset0:2 offset1:3
|
|
; CHECK-NEXT: s_waitcnt vmcnt(0)
|
|
; CHECK-NEXT: ds_write2_b64 v24, v[4:5], v[6:7] offset1:1
|
|
; CHECK-NEXT: s_endpgm
|
|
entry:
|
|
; Constant length 128; the trailing i1 false is the isvolatile flag.
tail call void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) @shared, ptr addrspace(4) %0, i64 128, i1 false)
|
|
ret void
|
|
}
|
|
|
|
; optsize variant (attribute group #1): copies 128 bytes from the addrspace(3)
; global @shared to the generic pointer %generic with a single non-volatile
; llvm.memcpy.p0.p3.i64 call. The autogenerated assertions below expect the
; copy to be expanded inline as ds_read_u8 / flat_store_byte pairs covering
; offsets 127 down to 0, ending in s_endpgm with no call emitted.
define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
|
|
; CHECK-LABEL: memcpy_p0_p3_optsize:
|
|
; CHECK: ; %bb.0: ; %entry
|
|
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
|
|
; CHECK-NEXT: v_mov_b32_e32 v2, 0
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:127
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:126
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:125
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:124
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: v_mov_b32_e32 v0, s0
|
|
; CHECK-NEXT: v_mov_b32_e32 v1, s1
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:127
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:126
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:123
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:125
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:124
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:122
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:121
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:123
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:120
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:119
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:122
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:121
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:118
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:120
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:119
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:117
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:116
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:118
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:115
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:114
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:117
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:116
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:113
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:115
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:114
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:112
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:111
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:113
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:110
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:109
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:112
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:111
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:108
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:110
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:109
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:107
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:106
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:108
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:105
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:104
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:107
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:106
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:103
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:105
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:104
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:102
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:101
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:103
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:100
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:99
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:102
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:101
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:98
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:100
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:99
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:97
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:96
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:98
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:95
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:94
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:97
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:96
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:93
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:95
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:94
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:92
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:91
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:93
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:90
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:89
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:92
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:91
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:88
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:90
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:89
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:87
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:86
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:88
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:85
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:84
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:87
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:86
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:83
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:85
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:84
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:82
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:81
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:83
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:80
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:79
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:82
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:81
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:78
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:80
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:79
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:77
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:76
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:78
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:75
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:74
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:77
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:76
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:73
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:75
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:74
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:72
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:71
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:73
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:70
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:69
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:72
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:71
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:68
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:70
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:69
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:67
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:66
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:68
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:65
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:64
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:67
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:66
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:63
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:65
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:64
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:62
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:61
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:63
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:60
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:59
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:62
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:61
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:58
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:60
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:59
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:57
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:56
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:58
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:55
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:54
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:57
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:56
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:53
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:55
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:54
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:52
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:51
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:53
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:50
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:49
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:52
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:51
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:48
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:50
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:49
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:47
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:46
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:48
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:45
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:44
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:47
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:46
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:43
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:45
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:44
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:42
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:41
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:43
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:40
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:39
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:42
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:41
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:38
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:40
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:39
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:37
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:36
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:38
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:35
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:34
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:37
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:36
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:33
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:35
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:34
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:32
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:31
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:33
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:30
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:29
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:32
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:31
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:28
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:30
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:29
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:27
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:26
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:28
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:25
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:24
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:27
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:26
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:23
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:25
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:24
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:22
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:21
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:23
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:20
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:19
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:22
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:21
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:18
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:20
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:19
|
|
; CHECK-NEXT: ds_read_u8 v3, v2 offset:16
|
|
; CHECK-NEXT: ds_read_u8 v5, v2 offset:17
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:18
|
|
; CHECK-NEXT: ds_read_u8 v4, v2 offset:8
|
|
; CHECK-NEXT: ds_read_u8 v6, v2 offset:9
|
|
; CHECK-NEXT: ds_read_u8 v7, v2 offset:10
|
|
; CHECK-NEXT: ds_read_u8 v8, v2 offset:11
|
|
; CHECK-NEXT: ds_read_u8 v9, v2 offset:12
|
|
; CHECK-NEXT: ds_read_u8 v10, v2 offset:13
|
|
; CHECK-NEXT: ds_read_u8 v11, v2 offset:14
|
|
; CHECK-NEXT: ds_read_u8 v12, v2 offset:15
|
|
; CHECK-NEXT: ds_read_u8 v13, v2
|
|
; CHECK-NEXT: ds_read_u8 v14, v2 offset:1
|
|
; CHECK-NEXT: ds_read_u8 v15, v2 offset:2
|
|
; CHECK-NEXT: ds_read_u8 v16, v2 offset:3
|
|
; CHECK-NEXT: ds_read_u8 v17, v2 offset:4
|
|
; CHECK-NEXT: ds_read_u8 v18, v2 offset:5
|
|
; CHECK-NEXT: ds_read_u8 v19, v2 offset:6
|
|
; CHECK-NEXT: ds_read_u8 v2, v2 offset:7
|
|
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v5 offset:17
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v3 offset:16
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v12 offset:15
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v11 offset:14
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v10 offset:13
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v9 offset:12
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v8 offset:11
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v7 offset:10
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v6 offset:9
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v4 offset:8
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v2 offset:7
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v19 offset:6
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v18 offset:5
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v17 offset:4
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v16 offset:3
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v15 offset:2
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v14 offset:1
|
|
; CHECK-NEXT: flat_store_byte v[0:1], v13
|
|
; CHECK-NEXT: s_endpgm
|
|
entry:
|
|
; Arguments: destination %generic, source @shared, length 128 bytes,
; isvolatile = false.
tail call void @llvm.memcpy.p0.p3.i64(ptr %generic, ptr addrspace(3) @shared, i64 128, i1 false)
|
|
ret void
|
|
}
|
|
|
|
; Declarations of the llvm.memcpy intrinsic overloads. The mangled suffix
; .p<D>.p<S>.i64 names the destination address space, the source address
; space and the length type, matching the pointer parameter types below.
; Each takes (dest, src, i64 length, immarg i1 isvolatile) and carries
; attribute group #2.
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #2
|
|
|
|
declare void @llvm.memcpy.p0.p5.i64(ptr noalias nocapture writeonly, ptr addrspace(5) noalias nocapture readonly, i64, i1 immarg) #2
|
|
|
|
declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64, i1 immarg) #2
|
|
|
|
declare void @llvm.memcpy.p1.p4.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
|
|
|
|
declare void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
|
|
|
|
declare void @llvm.memcpy.p3.p4.i64(ptr addrspace(3) noalias nocapture writeonly, ptr addrspace(4) noalias nocapture readonly, i64, i1 immarg) #2
|
|
|
|
declare void @llvm.memcpy.p0.p3.i64(ptr noalias nocapture writeonly, ptr addrspace(3) noalias nocapture readonly, i64, i1 immarg) #2
|
|
|
|
; Attribute groups referenced above.
; #0: minsize, attached to the *_minsize kernels (e.g. @memcpy_p0_p0_minsize).
attributes #0 = { minsize }
|
|
; #1: optsize, attached to the *_optsize kernels (e.g. @memcpy_p0_p3_optsize).
attributes #1 = { optsize }
|
|
; #2: attributes on the llvm.memcpy intrinsic declarations.
attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
|