This was much more difficult than I anticipated. The pass is not in a good state and has poor test coverage. The legacy PM path seems to rely on maintaining the map state across different SCCs, which seems bad. The pass also goes out of its way to avoid putting the attributes it introduces onto non-callee functions; if it simply added them, I would think we could use them directly instead of relying on the map. The NewPM path uses a ModulePass; I'm not sure whether we should be using CGSCC here, but some infrastructure appears to be missing to support backend-defined CGSCC passes.
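To make the ModulePass point concrete, here is a minimal hedged sketch of the shape under discussion — attach the results directly as function attributes rather than keeping a side map. The class name `AMDGPUPerfHintSketch` and the heuristic stubs are illustrative assumptions, not the in-tree implementation; only the attribute strings are taken from the test below.

```cpp
// Hedged sketch (hypothetical names): a NewPM module pass that records the
// perf-hint results as IR attributes so later consumers can query functions
// directly instead of going through a side map.
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

struct AMDGPUPerfHintSketch : PassInfoMixin<AMDGPUPerfHintSketch> {
  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) {
    bool Changed = false;
    for (Function &F : M) {
      if (F.isDeclaration())
        continue;
      // Stand-ins for the real heuristics: decide whether F is memory
      // bound and whether it should be wave limited.
      bool MemBound = false, WaveLimit = false;
      if (MemBound) {
        F.addFnAttr("amdgpu-memory-bound", "true");
        Changed = true;
      }
      if (WaveLimit) {
        F.addFnAttr("amdgpu-wave-limiter", "true");
        Changed = true;
      }
    }
    // Attribute-only changes; conservatively report nothing preserved
    // if anything was modified.
    return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
  }
};
```

The two opt RUN lines in the test exercise both pipelines: the legacy `-amdgpu-perf-hint` flag and the NewPM `-passes=amdgpu-perf-hint` spelling.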
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-perf-hint < %s | FileCheck -check-prefix=CHECK %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-perf-hint < %s | FileCheck -check-prefix=CHECK %s
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s

; GCN-LABEL: {{^}}test_membound:
; GCN: MemoryBound: 1
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @test_membound(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
; CHECK-LABEL: define amdgpu_kernel void @test_membound(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP7]], align 16
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP6]]
; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 16
; CHECK-NEXT: ret void
;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = zext i32 %tmp to i64
%tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
%tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
%tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
%tmp6 = add nuw nsw i64 %tmp2, 1
%tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
%tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
%tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
ret void
}

; GCN-LABEL: {{^}}test_membound_1:
; GCN: MemoryBound: 1
define amdgpu_kernel void @test_membound_1(ptr addrspace(1) nocapture readonly %ptr.0,
; CHECK-LABEL: define amdgpu_kernel void @test_membound_1(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[PTR_0:%.*]], ptr addrspace(1) nocapture [[PTR_1:%.*]], <2 x double> [[ARG_0:%.*]], i32 [[ARG_1:%.*]], <4 x double> [[ARG_2:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[BB_ENTRY:.*:]]
; CHECK-NEXT: [[ID_32:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[ID_0:%.*]] = zext i32 [[ID_32]] to i64
; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_0]]
; CHECK-NEXT: [[LD_0:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_0]], align 16
; CHECK-NEXT: [[ADD_0:%.*]] = fadd <2 x double> [[ARG_0]], [[LD_0]]
; CHECK-NEXT: [[ID_1:%.*]] = add nuw nsw i64 [[ID_0]], 1
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_1]]
; CHECK-NEXT: [[LD_1:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_1]], align 16
; CHECK-NEXT: [[ADD_1:%.*]] = fadd <2 x double> [[ADD_0]], [[LD_1]]
; CHECK-NEXT: [[ID_2:%.*]] = add nuw nsw i64 [[ID_0]], 2
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_2]]
; CHECK-NEXT: [[LD_2:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_2]], align 16
; CHECK-NEXT: [[ADD_2:%.*]] = fadd <2 x double> [[ADD_1]], [[LD_2]]
; CHECK-NEXT: [[ID_3:%.*]] = add nuw nsw i64 [[ID_0]], 3
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_3]]
; CHECK-NEXT: [[LD_3:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_3]], align 16
; CHECK-NEXT: [[ADD_3:%.*]] = fadd <2 x double> [[ADD_2]], [[LD_3]]
; CHECK-NEXT: [[ID_4:%.*]] = add nuw nsw i64 [[ID_0]], 4
; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_4]]
; CHECK-NEXT: [[LD_4:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_4]], align 16
; CHECK-NEXT: [[ADD_4:%.*]] = fadd <2 x double> [[ADD_3]], [[LD_4]]
; CHECK-NEXT: store <2 x double> [[ADD_4]], ptr addrspace(1) [[PTR_1]], align 16
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[ARG_1]], 0
; CHECK-NEXT: br i1 [[COND]], label %[[BB_TRUE:.*]], label %[[BB_RET:.*]]
; CHECK: [[BB_TRUE]]:
; CHECK-NEXT: [[I0_ARG_0:%.*]] = extractelement <2 x double> [[ARG_0]], i32 0
; CHECK-NEXT: [[I1_ARG_0:%.*]] = extractelement <2 x double> [[ARG_0]], i32 1
; CHECK-NEXT: [[ADD_1_0:%.*]] = fadd double [[I0_ARG_0]], [[I1_ARG_0]]
; CHECK-NEXT: [[I0_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 0
; CHECK-NEXT: [[I1_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 1
; CHECK-NEXT: [[ADD_1_1:%.*]] = fadd double [[I0_ARG_2]], [[I1_ARG_2]]
; CHECK-NEXT: [[ADD_1_2:%.*]] = fadd double [[ADD_1_0]], [[ADD_1_1]]
; CHECK-NEXT: [[I2_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 2
; CHECK-NEXT: [[I3_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 3
; CHECK-NEXT: [[ADD_1_3:%.*]] = fadd double [[I2_ARG_2]], [[I3_ARG_2]]
; CHECK-NEXT: [[ADD_1_4:%.*]] = fadd double [[ADD_1_2]], [[ADD_1_3]]
; CHECK-NEXT: [[I0_ADD_0:%.*]] = extractelement <2 x double> [[ADD_0]], i32 0
; CHECK-NEXT: [[I1_ADD_0:%.*]] = extractelement <2 x double> [[ADD_0]], i32 1
; CHECK-NEXT: [[ADD_1_5:%.*]] = fadd double [[I0_ADD_0]], [[I1_ADD_0]]
; CHECK-NEXT: [[ADD_1_6:%.*]] = fadd double [[ADD_1_4]], [[ADD_1_5]]
; CHECK-NEXT: [[I0_ADD_1:%.*]] = extractelement <2 x double> [[ADD_1]], i32 0
; CHECK-NEXT: [[I1_ADD_1:%.*]] = extractelement <2 x double> [[ADD_1]], i32 1
; CHECK-NEXT: [[ADD_1_7:%.*]] = fadd double [[I0_ADD_1]], [[I1_ADD_1]]
; CHECK-NEXT: [[ADD_1_8:%.*]] = fadd double [[ADD_1_6]], [[ADD_1_7]]
; CHECK-NEXT: [[I0_ADD_2:%.*]] = extractelement <2 x double> [[ADD_2]], i32 0
; CHECK-NEXT: [[I1_ADD_2:%.*]] = extractelement <2 x double> [[ADD_2]], i32 1
; CHECK-NEXT: [[ADD_1_9:%.*]] = fadd double [[I0_ADD_2]], [[I1_ADD_2]]
; CHECK-NEXT: [[ADD_1_10:%.*]] = fadd double [[ADD_1_8]], [[ADD_1_9]]
; CHECK-NEXT: store double [[ADD_1_8]], ptr addrspace(1) [[PTR_1]], align 8
; CHECK-NEXT: br label %[[BB_RET]]
; CHECK: [[BB_RET]]:
; CHECK-NEXT: ret void
;
ptr addrspace(1) nocapture %ptr.1,
<2 x double> %arg.0, i32 %arg.1, <4 x double> %arg.2) {
bb.entry:
%id.32 = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.0 = zext i32 %id.32 to i64
%gep.0 = getelementptr inbounds <2 x double>, ptr addrspace(1) %ptr.0, i64 %id.0
%ld.0 = load <2 x double>, ptr addrspace(1) %gep.0, align 16
%add.0 = fadd <2 x double> %arg.0, %ld.0

%id.1 = add nuw nsw i64 %id.0, 1
%gep.1 = getelementptr inbounds <2 x double>, ptr addrspace(1) %ptr.0, i64 %id.1
%ld.1 = load <2 x double>, ptr addrspace(1) %gep.1, align 16
%add.1 = fadd <2 x double> %add.0, %ld.1

%id.2 = add nuw nsw i64 %id.0, 2
%gep.2 = getelementptr inbounds <2 x double>, ptr addrspace(1) %ptr.0, i64 %id.2
%ld.2 = load <2 x double>, ptr addrspace(1) %gep.2, align 16
%add.2 = fadd <2 x double> %add.1, %ld.2

%id.3 = add nuw nsw i64 %id.0, 3
%gep.3 = getelementptr inbounds <2 x double>, ptr addrspace(1) %ptr.0, i64 %id.3
%ld.3 = load <2 x double>, ptr addrspace(1) %gep.3, align 16
%add.3 = fadd <2 x double> %add.2, %ld.3

%id.4 = add nuw nsw i64 %id.0, 4
%gep.4 = getelementptr inbounds <2 x double>, ptr addrspace(1) %ptr.0, i64 %id.4
%ld.4 = load <2 x double>, ptr addrspace(1) %gep.4, align 16
%add.4 = fadd <2 x double> %add.3, %ld.4

store <2 x double> %add.4, ptr addrspace(1) %ptr.1, align 16
%cond = icmp eq i32 %arg.1, 0
br i1 %cond, label %bb.true, label %bb.ret

bb.true:
%i0.arg.0 = extractelement <2 x double> %arg.0, i32 0
%i1.arg.0 = extractelement <2 x double> %arg.0, i32 1
%add.1.0 = fadd double %i0.arg.0, %i1.arg.0
%i0.arg.2 = extractelement <4 x double> %arg.2, i32 0
%i1.arg.2 = extractelement <4 x double> %arg.2, i32 1
%add.1.1 = fadd double %i0.arg.2, %i1.arg.2
%add.1.2 = fadd double %add.1.0, %add.1.1
%i2.arg.2 = extractelement <4 x double> %arg.2, i32 2
%i3.arg.2 = extractelement <4 x double> %arg.2, i32 3
%add.1.3 = fadd double %i2.arg.2, %i3.arg.2
%add.1.4 = fadd double %add.1.2, %add.1.3
%i0.add.0 = extractelement <2 x double> %add.0, i32 0
%i1.add.0 = extractelement <2 x double> %add.0, i32 1
%add.1.5 = fadd double %i0.add.0, %i1.add.0
%add.1.6 = fadd double %add.1.4, %add.1.5
%i0.add.1 = extractelement <2 x double> %add.1, i32 0
%i1.add.1 = extractelement <2 x double> %add.1, i32 1
%add.1.7 = fadd double %i0.add.1, %i1.add.1
%add.1.8 = fadd double %add.1.6, %add.1.7
%i0.add.2 = extractelement <2 x double> %add.2, i32 0
%i1.add.2 = extractelement <2 x double> %add.2, i32 1
%add.1.9 = fadd double %i0.add.2, %i1.add.2
%add.1.10 = fadd double %add.1.8, %add.1.9

store double %add.1.8, ptr addrspace(1) %ptr.1, align 8
br label %bb.ret

bb.ret:
ret void
}

; GCN-LABEL: {{^}}test_large_stride:
; GCN: MemoryBound: 0
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @test_large_stride(ptr addrspace(1) nocapture %arg) {
; CHECK-LABEL: define amdgpu_kernel void @test_large_stride(
; CHECK-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 4096
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP]], align 4
; CHECK-NEXT: [[MUL1:%.*]] = mul i32 [[TMP1]], [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1
; CHECK-NEXT: store i32 [[MUL1]], ptr addrspace(1) [[TMP2]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 8192
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[TMP3]], align 4
; CHECK-NEXT: [[MUL4:%.*]] = mul i32 [[TMP4]], [[TMP4]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
; CHECK-NEXT: store i32 [[MUL4]], ptr addrspace(1) [[TMP5]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 12288
; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
; CHECK-NEXT: [[MUL7:%.*]] = mul i32 [[TMP7]], [[TMP7]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
; CHECK-NEXT: store i32 [[MUL7]], ptr addrspace(1) [[TMP8]], align 4
; CHECK-NEXT: ret void
;
bb:
%tmp = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 4096
%tmp1 = load i32, ptr addrspace(1) %tmp, align 4
%mul1 = mul i32 %tmp1, %tmp1
%tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
store i32 %mul1, ptr addrspace(1) %tmp2, align 4
%tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 8192
%tmp4 = load i32, ptr addrspace(1) %tmp3, align 4
%mul4 = mul i32 %tmp4, %tmp4
%tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
store i32 %mul4, ptr addrspace(1) %tmp5, align 4
%tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 12288
%tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
%mul7 = mul i32 %tmp7, %tmp7
%tmp8 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
store i32 %mul7, ptr addrspace(1) %tmp8, align 4
ret void
}

; GCN-LABEL: {{^}}test_indirect:
; GCN: MemoryBound: 1
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @test_indirect(ptr addrspace(1) nocapture %arg) {
; CHECK-LABEL: define amdgpu_kernel void @test_indirect(
; CHECK-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR2]] {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[ARG]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) [[TMP11]], align 4
; CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(1) [[TMP15]], align 4
; CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[TMP1]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) [[TMP19]], align 4
; CHECK-NEXT: store i32 [[TMP20]], ptr addrspace(1) [[TMP2]], align 4
; CHECK-NEXT: ret void
;
bb:
%tmp = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
%tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
%tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
%tmp4 = load <4 x i32>, ptr addrspace(1) %arg, align 4
%tmp5 = extractelement <4 x i32> %tmp4, i32 0
%tmp6 = sext i32 %tmp5 to i64
%tmp7 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp6
%tmp8 = load i32, ptr addrspace(1) %tmp7, align 4
store i32 %tmp8, ptr addrspace(1) %arg, align 4
%tmp9 = extractelement <4 x i32> %tmp4, i32 1
%tmp10 = sext i32 %tmp9 to i64
%tmp11 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp10
%tmp12 = load i32, ptr addrspace(1) %tmp11, align 4
store i32 %tmp12, ptr addrspace(1) %tmp, align 4
%tmp13 = extractelement <4 x i32> %tmp4, i32 2
%tmp14 = sext i32 %tmp13 to i64
%tmp15 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp14
%tmp16 = load i32, ptr addrspace(1) %tmp15, align 4
store i32 %tmp16, ptr addrspace(1) %tmp1, align 4
%tmp17 = extractelement <4 x i32> %tmp4, i32 3
%tmp18 = sext i32 %tmp17 to i64
%tmp19 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp18
%tmp20 = load i32, ptr addrspace(1) %tmp19, align 4
store i32 %tmp20, ptr addrspace(1) %tmp2, align 4
ret void
}

; GCN-LABEL: {{^}}test_indirect_through_phi:
; GCN: MemoryBound: 0
; GCN: WaveLimiterHint : 0
define amdgpu_kernel void @test_indirect_through_phi(ptr addrspace(1) %arg) {
; CHECK-LABEL: define amdgpu_kernel void @test_indirect_through_phi(
; CHECK-SAME: ptr addrspace(1) [[ARG:%.*]]) {
; CHECK-NEXT: [[BB:.*]]:
; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr addrspace(1) [[ARG]], align 8
; CHECK-NEXT: [[LOAD_F:%.*]] = bitcast float [[LOAD]] to i32
; CHECK-NEXT: [[N:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]:
; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LOAD_F]], %[[BB]] ], [ [[AND2:%.*]], %[[BB1]] ]
; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[INC2:%.*]], %[[BB1]] ]
; CHECK-NEXT: [[AND1:%.*]] = and i32 [[PHI]], [[N]]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARG]], i32 [[AND1]]
; CHECK-NEXT: store float [[LOAD]], ptr addrspace(1) [[GEP]], align 4
; CHECK-NEXT: [[INC1:%.*]] = add nsw i32 [[PHI]], 1310720
; CHECK-NEXT: [[AND2]] = and i32 [[INC1]], [[N]]
; CHECK-NEXT: [[INC2]] = add nuw nsw i32 [[IND]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC2]], 1024
; CHECK-NEXT: br i1 [[CMP]], label %[[BB2:.*]], label %[[BB1]]
; CHECK: [[BB2]]:
; CHECK-NEXT: ret void
;
bb:
%load = load float, ptr addrspace(1) %arg, align 8
%load.f = bitcast float %load to i32
%n = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1

bb1: ; preds = %bb1, %bb
%phi = phi i32 [ %load.f, %bb ], [ %and2, %bb1 ]
%ind = phi i32 [ 0, %bb ], [ %inc2, %bb1 ]
%and1 = and i32 %phi, %n
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %and1
store float %load, ptr addrspace(1) %gep, align 4
%inc1 = add nsw i32 %phi, 1310720
%and2 = and i32 %inc1, %n
%inc2 = add nuw nsw i32 %ind, 1
%cmp = icmp eq i32 %inc2, 1024
br i1 %cmp, label %bb2, label %bb1

bb2: ; preds = %bb1
ret void
}

define void @test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
; CHECK-LABEL: define void @test_membound_func(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP7]], align 16
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP6]]
; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 16
; CHECK-NEXT: ret void
;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = zext i32 %tmp to i64
%tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
%tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
%tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
%tmp6 = add nuw nsw i64 %tmp2, 1
%tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
%tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
%tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
ret void
}

; GCN-LABEL: {{^}}kernel_call_test_membound_func:
; GCN: MemoryBound: 1
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @kernel_call_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_call_test_membound_func(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @test_membound_func(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
; CHECK-NEXT: ret void
;
call void @test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
ret void
}

; TODO: Probably should assume yes?
; GCN-LABEL: {{^}}kernel_indirect_call:
; GCN: MemoryBound: 0
; GCN: WaveLimiterHint : 0
define amdgpu_kernel void @kernel_indirect_call(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, ptr %fptr) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_indirect_call(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]], ptr [[FPTR:%.*]]) {
; CHECK-NEXT: call void [[FPTR]](ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
; CHECK-NEXT: ret void
;
call void %fptr(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
ret void
}

declare void @extern()

define void @maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
; CHECK-LABEL: define void @maybe_recursive_test_membound_func(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP7]], align 16
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP6]]
; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 16
; CHECK-NEXT: call void @extern()
; CHECK-NEXT: ret void
;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = zext i32 %tmp to i64
%tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
%tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
%tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
%tmp6 = add nuw nsw i64 %tmp2, 1
%tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
%tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
%tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
call void @extern()
ret void
}

; GCN-LABEL: {{^}}kernel_call_maybe_recursive_test_membound_func:
; GCN: MemoryBound: 1
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @kernel_call_maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, ptr %fptr) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_call_maybe_recursive_test_membound_func(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]], ptr [[FPTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
; CHECK-NEXT: ret void
;
call void @maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
ret void
}

define void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
; CHECK-LABEL: define void @mutually_recursive_test_membound_func_0(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
; CHECK-NEXT: call void @mutually_recursive_test_membound_func_1(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
; CHECK-NEXT: ret void
;
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = zext i32 %tmp to i64
%tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
%tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
%tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
call void @mutually_recursive_test_membound_func_1(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
ret void
}

define void @mutually_recursive_test_membound_func_1(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
; CHECK-LABEL: define void @mutually_recursive_test_membound_func_1(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP7]], align 16
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP6]]
; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 16
; CHECK-NEXT: call void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
; CHECK-NEXT: ret void
;
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = zext i32 %tmp to i64
%tmp6 = add nuw nsw i64 %tmp2, 1
%tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
%tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
%tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
call void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
ret void
}

; GCN-LABEL: {{^}}kernel_call_mutually_recursive_test_membound_func_0:
; GCN: MemoryBound: 1
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @kernel_call_mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, ptr %fptr) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_call_mutually_recursive_test_membound_func_0(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]], ptr [[FPTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
; CHECK-NEXT: ret void
;
call void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

;.
; CHECK: attributes #[[ATTR0]] = { "amdgpu-memory-bound"="true" "amdgpu-wave-limiter"="true" }
; CHECK: attributes #[[ATTR1]] = { "amdgpu-memory-bound"="true" }
; CHECK: attributes #[[ATTR2]] = { "amdgpu-wave-limiter"="true" }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.