clang-p2996/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX10 %s

; Test case looks at the allocated offset of @used_by_both. It's at zero when
; allocated by itself, but at 8 when allocated in combination with the double.
; Redundantly also checks LDSByteSize.
; i32 stored to by @nonkernel and by the kernels @withcall and
; @nocall_false_sharing.
@used_by_both = addrspace(3) global i32 undef
; i32 stored to only by the kernel @nocall_ideal.
@used_by_kernel = addrspace(3) global i32 undef
; double stored to only by the non-kernel function @nonkernel.
@used_by_function = addrspace(3) global double undef

; A kernel that calls no functions and uses one LDS variable allocates only that
; variable, so the access is at offset 0 and LDSByteSize is 4.
define amdgpu_kernel void @nocall_ideal() {
; CHECK-LABEL: nocall_ideal:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    ds_write_b32 v0, v0
; CHECK-NEXT:    s_endpgm
  ; The only LDS access in this kernel. In the expected output above the
  ; ds_write_b32 uses v0 (set to 0) as its address and carries no offset.
  store i32 0, ptr addrspace(3) @used_by_kernel
  ret void
}
; CHECK: ; LDSByteSize: 4 bytes

define void @nonkernel() {
; GFX9-LABEL: nonkernel:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v0, 0
; GFX9-NEXT:    v_mov_b32_e32 v1, v0
; GFX9-NEXT:    ds_write_b32 v0, v0 offset:8
; GFX9-NEXT:    ds_write_b64 v0, v[0:1]
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: nonkernel:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    v_mov_b32_e32 v0, 0
; GFX10-NEXT:    v_mov_b32_e32 v1, v0
; GFX10-NEXT:    ds_write_b32 v0, v0 offset:8
; GFX10-NEXT:    ds_write_b64 v0, v[0:1]
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; G_GFX9-LABEL: nonkernel:
; G_GFX9:       ; %bb.0:
; G_GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; G_GFX9-NEXT:    v_mov_b32_e32 v2, 0
; G_GFX9-NEXT:    v_mov_b32_e32 v3, 8
; G_GFX9-NEXT:    v_mov_b32_e32 v0, 0
; G_GFX9-NEXT:    v_mov_b32_e32 v1, 0
; G_GFX9-NEXT:    ds_write_b32 v3, v2
; G_GFX9-NEXT:    ds_write_b64 v2, v[0:1]
; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; G_GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; G_GFX10-LABEL: nonkernel:
; G_GFX10:       ; %bb.0:
; G_GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; G_GFX10-NEXT:    v_mov_b32_e32 v2, 0
; G_GFX10-NEXT:    v_mov_b32_e32 v3, 8
; G_GFX10-NEXT:    v_mov_b32_e32 v0, 0
; G_GFX10-NEXT:    v_mov_b32_e32 v1, 0
; G_GFX10-NEXT:    ds_write_b32 v3, v2
; G_GFX10-NEXT:    ds_write_b64 v2, v[0:1]
; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT:    s_setpc_b64 s[30:31]
  ; Non-kernel function that touches two LDS variables. In the expected output
  ; above, the 32-bit write targets offset 8 (either as an immediate offset or
  ; through a register loaded with 8) and the 64-bit write targets offset 0.
  store i32 0, ptr addrspace(3) @used_by_both
  store double 0.0, ptr addrspace(3) @used_by_function
  ret void
}

; Needs to allocate both variables, so the store to @used_by_both is at offset
; sizeof(double), i.e. 8.
define amdgpu_kernel void @withcall() {
; GFX9-LABEL: withcall:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX9-NEXT:    s_mov_b32 s22, -1
; GFX9-NEXT:    s_mov_b32 s23, 0xe00000
; GFX9-NEXT:    s_add_u32 s20, s20, s9
; GFX9-NEXT:    s_addc_u32 s21, s21, 0
; GFX9-NEXT:    s_mov_b32 s14, s8
; GFX9-NEXT:    s_add_u32 s8, s2, 36
; GFX9-NEXT:    s_addc_u32 s9, s3, 0
; GFX9-NEXT:    s_getpc_b64 s[2:3]
; GFX9-NEXT:    s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4
; GFX9-NEXT:    s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12
; GFX9-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX9-NEXT:    s_mov_b64 s[0:1], s[20:21]
; GFX9-NEXT:    v_mov_b32_e32 v3, 0
; GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX9-NEXT:    s_mov_b32 s12, s6
; GFX9-NEXT:    s_mov_b32 s13, s7
; GFX9-NEXT:    s_mov_b64 s[2:3], s[22:23]
; GFX9-NEXT:    s_mov_b32 s32, 0
; GFX9-NEXT:    ds_write_b32 v3, v3 offset:8
; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX9-NEXT:    s_endpgm
;
; GFX10-LABEL: withcall:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; GFX10-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; GFX10-NEXT:    s_mov_b32 s22, -1
; GFX10-NEXT:    s_mov_b32 s23, 0x31c16000
; GFX10-NEXT:    s_add_u32 s20, s20, s9
; GFX10-NEXT:    s_addc_u32 s21, s21, 0
; GFX10-NEXT:    s_mov_b32 s14, s8
; GFX10-NEXT:    s_add_u32 s8, s2, 36
; GFX10-NEXT:    s_addc_u32 s9, s3, 0
; GFX10-NEXT:    s_getpc_b64 s[2:3]
; GFX10-NEXT:    s_add_u32 s2, s2, nonkernel@gotpcrel32@lo+4
; GFX10-NEXT:    s_addc_u32 s3, s3, nonkernel@gotpcrel32@hi+12
; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; GFX10-NEXT:    s_load_dwordx2 s[16:17], s[2:3], 0x0
; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; GFX10-NEXT:    v_mov_b32_e32 v3, 0
; GFX10-NEXT:    s_mov_b64 s[10:11], s[4:5]
; GFX10-NEXT:    s_mov_b64 s[4:5], s[0:1]
; GFX10-NEXT:    s_mov_b64 s[0:1], s[20:21]
; GFX10-NEXT:    v_or3_b32 v31, v0, v1, v2
; GFX10-NEXT:    s_mov_b32 s12, s6
; GFX10-NEXT:    s_mov_b32 s13, s7
; GFX10-NEXT:    s_mov_b64 s[2:3], s[22:23]
; GFX10-NEXT:    s_mov_b32 s32, 0
; GFX10-NEXT:    ds_write_b32 v3, v3 offset:8
; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; GFX10-NEXT:    s_endpgm
;
; G_GFX9-LABEL: withcall:
; G_GFX9:       ; %bb.0:
; G_GFX9-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; G_GFX9-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; G_GFX9-NEXT:    s_mov_b32 s22, -1
; G_GFX9-NEXT:    s_mov_b32 s23, 0xe00000
; G_GFX9-NEXT:    s_add_u32 s20, s20, s9
; G_GFX9-NEXT:    s_addc_u32 s21, s21, 0
; G_GFX9-NEXT:    s_mov_b32 s14, s8
; G_GFX9-NEXT:    s_add_u32 s8, s2, 36
; G_GFX9-NEXT:    s_addc_u32 s9, s3, 0
; G_GFX9-NEXT:    s_mov_b64 s[10:11], s[4:5]
; G_GFX9-NEXT:    s_mov_b64 s[4:5], s[0:1]
; G_GFX9-NEXT:    s_getpc_b64 s[0:1]
; G_GFX9-NEXT:    s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
; G_GFX9-NEXT:    s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
; G_GFX9-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
; G_GFX9-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; G_GFX9-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; G_GFX9-NEXT:    s_mov_b64 s[0:1], s[20:21]
; G_GFX9-NEXT:    v_mov_b32_e32 v3, 0
; G_GFX9-NEXT:    v_mov_b32_e32 v4, 8
; G_GFX9-NEXT:    v_or3_b32 v31, v0, v1, v2
; G_GFX9-NEXT:    s_mov_b64 s[2:3], s[22:23]
; G_GFX9-NEXT:    s_mov_b32 s12, s6
; G_GFX9-NEXT:    s_mov_b32 s13, s7
; G_GFX9-NEXT:    s_mov_b32 s32, 0
; G_GFX9-NEXT:    ds_write_b32 v4, v3
; G_GFX9-NEXT:    s_waitcnt lgkmcnt(0)
; G_GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; G_GFX9-NEXT:    s_endpgm
;
; G_GFX10-LABEL: withcall:
; G_GFX10:       ; %bb.0:
; G_GFX10-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
; G_GFX10-NEXT:    s_mov_b32 s21, SCRATCH_RSRC_DWORD1
; G_GFX10-NEXT:    s_mov_b32 s22, -1
; G_GFX10-NEXT:    s_mov_b32 s23, 0x31c16000
; G_GFX10-NEXT:    s_add_u32 s20, s20, s9
; G_GFX10-NEXT:    s_addc_u32 s21, s21, 0
; G_GFX10-NEXT:    s_mov_b32 s14, s8
; G_GFX10-NEXT:    s_add_u32 s8, s2, 36
; G_GFX10-NEXT:    s_addc_u32 s9, s3, 0
; G_GFX10-NEXT:    s_mov_b64 s[10:11], s[4:5]
; G_GFX10-NEXT:    s_mov_b64 s[4:5], s[0:1]
; G_GFX10-NEXT:    s_getpc_b64 s[0:1]
; G_GFX10-NEXT:    s_add_u32 s0, s0, nonkernel@gotpcrel32@lo+4
; G_GFX10-NEXT:    s_addc_u32 s1, s1, nonkernel@gotpcrel32@hi+12
; G_GFX10-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
; G_GFX10-NEXT:    s_load_dwordx2 s[16:17], s[0:1], 0x0
; G_GFX10-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
; G_GFX10-NEXT:    v_mov_b32_e32 v3, 0
; G_GFX10-NEXT:    v_mov_b32_e32 v4, 8
; G_GFX10-NEXT:    s_mov_b64 s[0:1], s[20:21]
; G_GFX10-NEXT:    s_mov_b64 s[2:3], s[22:23]
; G_GFX10-NEXT:    v_or3_b32 v31, v0, v1, v2
; G_GFX10-NEXT:    s_mov_b32 s12, s6
; G_GFX10-NEXT:    s_mov_b32 s13, s7
; G_GFX10-NEXT:    s_mov_b32 s32, 0
; G_GFX10-NEXT:    ds_write_b32 v4, v3
; G_GFX10-NEXT:    s_waitcnt lgkmcnt(0)
; G_GFX10-NEXT:    s_swappc_b64 s[30:31], s[16:17]
; G_GFX10-NEXT:    s_endpgm
  ; Kernel-side store to the shared i32. In the expected output above this
  ; write targets offset 8 (immediate offset, or a register loaded with 8),
  ; the same offset the expected output of @nonkernel uses for this variable.
  store i32 0, ptr addrspace(3) @used_by_both
  ; The callee stores to @used_by_both and to @used_by_function.
  call void @nonkernel()
  ret void
}
; CHECK: ; LDSByteSize: 16 bytes

; Previous lowering was less efficient here than necessary as the i32 used
; by the kernel is also used by an unrelated non-kernel function. Codegen
; is now the same as nocall_ideal.
define amdgpu_kernel void @nocall_false_sharing() {
; CHECK-LABEL: nocall_false_sharing:
; CHECK:       ; %bb.0:
; CHECK-NEXT:    v_mov_b32_e32 v0, 0
; CHECK-NEXT:    ds_write_b32 v0, v0
; CHECK-NEXT:    s_endpgm
  ; This kernel makes no calls. Its store to the shared i32 is expected at
  ; offset 0 (the ds_write_b32 above addresses through v0, which holds 0),
  ; unlike the offset-8 store expected for @withcall.
  store i32 0, ptr addrspace(3) @used_by_both
  ret void
}
; CHECK: ; LDSByteSize: 4 bytes

; Module flag setting "amdhsa_code_object_version" to 500. The leading i32 1 is
; the flag's merge behavior (1 is "Error" in the LLVM LangRef module-flags
; table). Presumably 500 selects AMDHSA code object v5 (TODO confirm).
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}