; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-perf-hint < %s | FileCheck -check-prefix=CHECK %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-perf-hint < %s | FileCheck -check-prefix=CHECK %s
; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
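; This file exercises AMDGPUPerfHintAnalysis: the two opt RUN lines check the
; "amdgpu-memory-bound" and "amdgpu-wave-limiter" attributes the pass adds
; (legacy and new pass manager flavors), and the llc RUN line checks the
; MemoryBound / WaveLimiterHint values the backend reports.

; A straight-line sequence dominated by wide global loads and stores: the
; kernel is expected to be marked both memory bound and wave limited.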
; GCN-LABEL: {{^}}test_membound:
; GCN: MemoryBound: 1
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @test_membound(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
; CHECK-LABEL: define amdgpu_kernel void @test_membound(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP7]], align 16
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP6]]
; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 16
; CHECK-NEXT: ret void
;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = zext i32 %tmp to i64
%tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
%tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
%tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
%tmp6 = add nuw nsw i64 %tmp2, 1
%tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
%tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
%tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
ret void
}
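; A chain of vector FP loads feeding fadds is still memory bound, but only the
; "amdgpu-memory-bound" attribute is expected here; there is no
; WaveLimiterHint check for this kernel.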
; GCN-LABEL: {{^}}test_membound_1:
; GCN: MemoryBound: 1
define amdgpu_kernel void @test_membound_1(ptr addrspace(1) nocapture readonly %ptr.0,
; CHECK-LABEL: define amdgpu_kernel void @test_membound_1(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[PTR_0:%.*]], ptr addrspace(1) nocapture [[PTR_1:%.*]], <2 x double> [[ARG_0:%.*]], i32 [[ARG_1:%.*]], <4 x double> [[ARG_2:%.*]]) #[[ATTR1:[0-9]+]] {
; CHECK-NEXT: [[BB_ENTRY:.*:]]
; CHECK-NEXT: [[ID_32:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[ID_0:%.*]] = zext i32 [[ID_32]] to i64
; CHECK-NEXT: [[GEP_0:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_0]]
; CHECK-NEXT: [[LD_0:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_0]], align 16
; CHECK-NEXT: [[ADD_0:%.*]] = fadd <2 x double> [[ARG_0]], [[LD_0]]
; CHECK-NEXT: [[ID_1:%.*]] = add nuw nsw i64 [[ID_0]], 1
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_1]]
; CHECK-NEXT: [[LD_1:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_1]], align 16
; CHECK-NEXT: [[ADD_1:%.*]] = fadd <2 x double> [[ADD_0]], [[LD_1]]
; CHECK-NEXT: [[ID_2:%.*]] = add nuw nsw i64 [[ID_0]], 2
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_2]]
; CHECK-NEXT: [[LD_2:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_2]], align 16
; CHECK-NEXT: [[ADD_2:%.*]] = fadd <2 x double> [[ADD_1]], [[LD_2]]
; CHECK-NEXT: [[ID_3:%.*]] = add nuw nsw i64 [[ID_0]], 3
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_3]]
; CHECK-NEXT: [[LD_3:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_3]], align 16
; CHECK-NEXT: [[ADD_3:%.*]] = fadd <2 x double> [[ADD_2]], [[LD_3]]
; CHECK-NEXT: [[ID_4:%.*]] = add nuw nsw i64 [[ID_0]], 4
; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds <2 x double>, ptr addrspace(1) [[PTR_0]], i64 [[ID_4]]
; CHECK-NEXT: [[LD_4:%.*]] = load <2 x double>, ptr addrspace(1) [[GEP_4]], align 16
; CHECK-NEXT: [[ADD_4:%.*]] = fadd <2 x double> [[ADD_3]], [[LD_4]]
; CHECK-NEXT: store <2 x double> [[ADD_4]], ptr addrspace(1) [[PTR_1]], align 16
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[ARG_1]], 0
; CHECK-NEXT: br i1 [[COND]], label %[[BB_TRUE:.*]], label %[[BB_RET:.*]]
; CHECK: [[BB_TRUE]]:
; CHECK-NEXT: [[I0_ARG_0:%.*]] = extractelement <2 x double> [[ARG_0]], i32 0
; CHECK-NEXT: [[I1_ARG_0:%.*]] = extractelement <2 x double> [[ARG_0]], i32 1
; CHECK-NEXT: [[ADD_1_0:%.*]] = fadd double [[I0_ARG_0]], [[I1_ARG_0]]
; CHECK-NEXT: [[I0_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 0
; CHECK-NEXT: [[I1_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 1
; CHECK-NEXT: [[ADD_1_1:%.*]] = fadd double [[I0_ARG_2]], [[I1_ARG_2]]
; CHECK-NEXT: [[ADD_1_2:%.*]] = fadd double [[ADD_1_0]], [[ADD_1_1]]
; CHECK-NEXT: [[I2_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 2
; CHECK-NEXT: [[I3_ARG_2:%.*]] = extractelement <4 x double> [[ARG_2]], i32 3
; CHECK-NEXT: [[ADD_1_3:%.*]] = fadd double [[I2_ARG_2]], [[I3_ARG_2]]
; CHECK-NEXT: [[ADD_1_4:%.*]] = fadd double [[ADD_1_2]], [[ADD_1_3]]
; CHECK-NEXT: [[I0_ADD_0:%.*]] = extractelement <2 x double> [[ADD_0]], i32 0
; CHECK-NEXT: [[I1_ADD_0:%.*]] = extractelement <2 x double> [[ADD_0]], i32 1
; CHECK-NEXT: [[ADD_1_5:%.*]] = fadd double [[I0_ADD_0]], [[I1_ADD_0]]
; CHECK-NEXT: [[ADD_1_6:%.*]] = fadd double [[ADD_1_4]], [[ADD_1_5]]
; CHECK-NEXT: [[I0_ADD_1:%.*]] = extractelement <2 x double> [[ADD_1]], i32 0
; CHECK-NEXT: [[I1_ADD_1:%.*]] = extractelement <2 x double> [[ADD_1]], i32 1
; CHECK-NEXT: [[ADD_1_7:%.*]] = fadd double [[I0_ADD_1]], [[I1_ADD_1]]
; CHECK-NEXT: [[ADD_1_8:%.*]] = fadd double [[ADD_1_6]], [[ADD_1_7]]
; CHECK-NEXT: [[I0_ADD_2:%.*]] = extractelement <2 x double> [[ADD_2]], i32 0
; CHECK-NEXT: [[I1_ADD_2:%.*]] = extractelement <2 x double> [[ADD_2]], i32 1
; CHECK-NEXT: [[ADD_1_9:%.*]] = fadd double [[I0_ADD_2]], [[I1_ADD_2]]
; CHECK-NEXT: [[ADD_1_10:%.*]] = fadd double [[ADD_1_8]], [[ADD_1_9]]
; CHECK-NEXT: store double [[ADD_1_8]], ptr addrspace(1) [[PTR_1]], align 8
; CHECK-NEXT: br label %[[BB_RET]]
; CHECK: [[BB_RET]]:
; CHECK-NEXT: ret void
;
ptr addrspace(1) nocapture %ptr.1,
<2 x double> %arg.0, i32 %arg.1, <4 x double> %arg.2) {
bb.entry:
%id.32 = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.0 = zext i32 %id.32 to i64
%gep.0 = getelementptr inbounds <2 x double>, ptr addrspace(1) %ptr.0, i64 %id.0
%ld.0 = load <2 x double>, ptr addrspace(1) %gep.0, align 16
%add.0 = fadd <2 x double> %arg.0, %ld.0
%id.1 = add nuw nsw i64 %id.0, 1
%gep.1 = getelementptr inbounds <2 x double>, ptr addrspace(1) %ptr.0, i64 %id.1
%ld.1 = load <2 x double>, ptr addrspace(1) %gep.1, align 16
%add.1 = fadd <2 x double> %add.0, %ld.1
%id.2 = add nuw nsw i64 %id.0, 2
%gep.2 = getelementptr inbounds <2 x double>, ptr addrspace(1) %ptr.0, i64 %id.2
%ld.2 = load <2 x double>, ptr addrspace(1) %gep.2, align 16
%add.2 = fadd <2 x double> %add.1, %ld.2
%id.3 = add nuw nsw i64 %id.0, 3
%gep.3 = getelementptr inbounds <2 x double>, ptr addrspace(1) %ptr.0, i64 %id.3
%ld.3 = load <2 x double>, ptr addrspace(1) %gep.3, align 16
%add.3 = fadd <2 x double> %add.2, %ld.3
%id.4 = add nuw nsw i64 %id.0, 4
%gep.4 = getelementptr inbounds <2 x double>, ptr addrspace(1) %ptr.0, i64 %id.4
%ld.4 = load <2 x double>, ptr addrspace(1) %gep.4, align 16
%add.4 = fadd <2 x double> %add.3, %ld.4
store <2 x double> %add.4, ptr addrspace(1) %ptr.1, align 16
%cond = icmp eq i32 %arg.1, 0
br i1 %cond, label %bb.true, label %bb.ret
bb.true:
%i0.arg.0 = extractelement <2 x double> %arg.0, i32 0
%i1.arg.0 = extractelement <2 x double> %arg.0, i32 1
%add.1.0 = fadd double %i0.arg.0, %i1.arg.0
%i0.arg.2 = extractelement <4 x double> %arg.2, i32 0
%i1.arg.2 = extractelement <4 x double> %arg.2, i32 1
%add.1.1 = fadd double %i0.arg.2, %i1.arg.2
%add.1.2 = fadd double %add.1.0, %add.1.1
%i2.arg.2 = extractelement <4 x double> %arg.2, i32 2
%i3.arg.2 = extractelement <4 x double> %arg.2, i32 3
%add.1.3 = fadd double %i2.arg.2, %i3.arg.2
%add.1.4 = fadd double %add.1.2, %add.1.3
%i0.add.0 = extractelement <2 x double> %add.0, i32 0
%i1.add.0 = extractelement <2 x double> %add.0, i32 1
%add.1.5 = fadd double %i0.add.0, %i1.add.0
%add.1.6 = fadd double %add.1.4, %add.1.5
%i0.add.1 = extractelement <2 x double> %add.1, i32 0
%i1.add.1 = extractelement <2 x double> %add.1, i32 1
%add.1.7 = fadd double %i0.add.1, %i1.add.1
%add.1.8 = fadd double %add.1.6, %add.1.7
%i0.add.2 = extractelement <2 x double> %add.2, i32 0
%i1.add.2 = extractelement <2 x double> %add.2, i32 1
%add.1.9 = fadd double %i0.add.2, %i1.add.2
%add.1.10 = fadd double %add.1.8, %add.1.9
store double %add.1.8, ptr addrspace(1) %ptr.1, align 8
br label %bb.ret
bb.ret:
ret void
}
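; Accesses with a large constant stride are expected to set the wave limiter
; hint without marking the kernel memory bound.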
; GCN-LABEL: {{^}}test_large_stride:
; GCN: MemoryBound: 0
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @test_large_stride(ptr addrspace(1) nocapture %arg) {
; CHECK-LABEL: define amdgpu_kernel void @test_large_stride(
; CHECK-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR2:[0-9]+]] {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 4096
; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[TMP]], align 4
; CHECK-NEXT: [[MUL1:%.*]] = mul i32 [[TMP1]], [[TMP1]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1
; CHECK-NEXT: store i32 [[MUL1]], ptr addrspace(1) [[TMP2]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 8192
; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(1) [[TMP3]], align 4
; CHECK-NEXT: [[MUL4:%.*]] = mul i32 [[TMP4]], [[TMP4]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
; CHECK-NEXT: store i32 [[MUL4]], ptr addrspace(1) [[TMP5]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 12288
; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4
; CHECK-NEXT: [[MUL7:%.*]] = mul i32 [[TMP7]], [[TMP7]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
; CHECK-NEXT: store i32 [[MUL7]], ptr addrspace(1) [[TMP8]], align 4
; CHECK-NEXT: ret void
;
bb:
%tmp = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 4096
%tmp1 = load i32, ptr addrspace(1) %tmp, align 4
%mul1 = mul i32 %tmp1, %tmp1
%tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
store i32 %mul1, ptr addrspace(1) %tmp2, align 4
%tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 8192
%tmp4 = load i32, ptr addrspace(1) %tmp3, align 4
%mul4 = mul i32 %tmp4, %tmp4
%tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
store i32 %mul4, ptr addrspace(1) %tmp5, align 4
%tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 12288
%tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
%mul7 = mul i32 %tmp7, %tmp7
%tmp8 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
store i32 %mul7, ptr addrspace(1) %tmp8, align 4
ret void
}
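; Loads whose addresses are themselves loaded from memory (indirect access)
; are expected to mark the kernel both memory bound and wave limited.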
; GCN-LABEL: {{^}}test_indirect:
; GCN: MemoryBound: 1
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @test_indirect(ptr addrspace(1) nocapture %arg) {
; CHECK-LABEL: define amdgpu_kernel void @test_indirect(
; CHECK-SAME: ptr addrspace(1) nocapture [[ARG:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP7]], align 4
; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[ARG]], align 4
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) [[TMP11]], align 4
; CHECK-NEXT: store i32 [[TMP12]], ptr addrspace(1) [[TMP]], align 4
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr addrspace(1) [[TMP15]], align 4
; CHECK-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[TMP1]], align 4
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 [[TMP18]]
; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) [[TMP19]], align 4
; CHECK-NEXT: store i32 [[TMP20]], ptr addrspace(1) [[TMP2]], align 4
; CHECK-NEXT: ret void
;
bb:
%tmp = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
%tmp1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
%tmp2 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
%tmp4 = load <4 x i32>, ptr addrspace(1) %arg, align 4
%tmp5 = extractelement <4 x i32> %tmp4, i32 0
%tmp6 = sext i32 %tmp5 to i64
%tmp7 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp6
%tmp8 = load i32, ptr addrspace(1) %tmp7, align 4
store i32 %tmp8, ptr addrspace(1) %arg, align 4
%tmp9 = extractelement <4 x i32> %tmp4, i32 1
%tmp10 = sext i32 %tmp9 to i64
%tmp11 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp10
%tmp12 = load i32, ptr addrspace(1) %tmp11, align 4
store i32 %tmp12, ptr addrspace(1) %tmp, align 4
%tmp13 = extractelement <4 x i32> %tmp4, i32 2
%tmp14 = sext i32 %tmp13 to i64
%tmp15 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp14
%tmp16 = load i32, ptr addrspace(1) %tmp15, align 4
store i32 %tmp16, ptr addrspace(1) %tmp1, align 4
%tmp17 = extractelement <4 x i32> %tmp4, i32 3
%tmp18 = sext i32 %tmp17 to i64
%tmp19 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp18
%tmp20 = load i32, ptr addrspace(1) %tmp19, align 4
store i32 %tmp20, ptr addrspace(1) %tmp2, align 4
ret void
}
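; The indirect-looking access here only arises through a loop phi, which the
; analysis does not follow; neither hint is expected to be set.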
; GCN-LABEL: {{^}}test_indirect_through_phi:
; GCN: MemoryBound: 0
; GCN: WaveLimiterHint : 0
define amdgpu_kernel void @test_indirect_through_phi(ptr addrspace(1) %arg) {
; CHECK-LABEL: define amdgpu_kernel void @test_indirect_through_phi(
; CHECK-SAME: ptr addrspace(1) [[ARG:%.*]]) {
; CHECK-NEXT: [[BB:.*]]:
; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr addrspace(1) [[ARG]], align 8
; CHECK-NEXT: [[LOAD_F:%.*]] = bitcast float [[LOAD]] to i32
; CHECK-NEXT: [[N:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: br label %[[BB1:.*]]
; CHECK: [[BB1]]:
; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LOAD_F]], %[[BB]] ], [ [[AND2:%.*]], %[[BB1]] ]
; CHECK-NEXT: [[IND:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[INC2:%.*]], %[[BB1]] ]
; CHECK-NEXT: [[AND1:%.*]] = and i32 [[PHI]], [[N]]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARG]], i32 [[AND1]]
; CHECK-NEXT: store float [[LOAD]], ptr addrspace(1) [[GEP]], align 4
; CHECK-NEXT: [[INC1:%.*]] = add nsw i32 [[PHI]], 1310720
; CHECK-NEXT: [[AND2]] = and i32 [[INC1]], [[N]]
; CHECK-NEXT: [[INC2]] = add nuw nsw i32 [[IND]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC2]], 1024
; CHECK-NEXT: br i1 [[CMP]], label %[[BB2:.*]], label %[[BB1]]
; CHECK: [[BB2]]:
; CHECK-NEXT: ret void
;
bb:
%load = load float, ptr addrspace(1) %arg, align 8
%load.f = bitcast float %load to i32
%n = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
bb1: ; preds = %bb1, %bb
%phi = phi i32 [ %load.f, %bb ], [ %and2, %bb1 ]
%ind = phi i32 [ 0, %bb ], [ %inc2, %bb1 ]
%and1 = and i32 %phi, %n
%gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %and1
store float %load, ptr addrspace(1) %gep, align 4
%inc1 = add nsw i32 %phi, 1310720
%and2 = and i32 %inc1, %n
%inc2 = add nuw nsw i32 %ind, 1
%cmp = icmp eq i32 %inc2, 1024
br i1 %cmp, label %bb2, label %bb1
bb2: ; preds = %bb1
ret void
}
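; A non-kernel callee: the function itself only gets "amdgpu-memory-bound",
; while the kernel calling it below is marked both memory bound and wave
; limited.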
define void @test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
; CHECK-LABEL: define void @test_membound_func(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP7]], align 16
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP6]]
; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 16
; CHECK-NEXT: ret void
;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = zext i32 %tmp to i64
%tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
%tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
%tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
%tmp6 = add nuw nsw i64 %tmp2, 1
%tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
%tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
%tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
ret void
}
; GCN-LABEL: {{^}}kernel_call_test_membound_func:
; GCN: MemoryBound: 1
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @kernel_call_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_call_test_membound_func(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @test_membound_func(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
; CHECK-NEXT: ret void
;
call void @test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
ret void
}
; TODO: Probably should assume yes?
; GCN-LABEL: {{^}}kernel_indirect_call:
; GCN: MemoryBound: 0
; GCN: WaveLimiterHint : 0
define amdgpu_kernel void @kernel_indirect_call(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, ptr %fptr) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_indirect_call(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]], ptr [[FPTR:%.*]]) {
; CHECK-NEXT: call void [[FPTR]](ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
; CHECK-NEXT: ret void
;
call void %fptr(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
ret void
}
declare void @extern()
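; A memory bound callee that could, through the opaque call to @extern, end up
; recursive; the memory-bound result is still expected to propagate to the
; calling kernel below.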
define void @maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
; CHECK-LABEL: define void @maybe_recursive_test_membound_func(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP7]], align 16
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP6]]
; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 16
; CHECK-NEXT: call void @extern()
; CHECK-NEXT: ret void
;
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = zext i32 %tmp to i64
%tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
%tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
%tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
%tmp6 = add nuw nsw i64 %tmp2, 1
%tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
%tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
%tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
call void @extern()
ret void
}
; GCN-LABEL: {{^}}kernel_call_maybe_recursive_test_membound_func:
; GCN: MemoryBound: 1
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @kernel_call_maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, ptr %fptr) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_call_maybe_recursive_test_membound_func(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]], ptr [[FPTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
; CHECK-NEXT: ret void
;
call void @maybe_recursive_test_membound_func(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
ret void
}
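; A mutually recursive pair of memory bound functions; the hints are still
; expected to reach the kernel that enters the cycle.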
define void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
; CHECK-LABEL: define void @mutually_recursive_test_membound_func_0(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP]] to i64
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP3]], align 16
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP2]]
; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(1) [[TMP5]], align 16
; CHECK-NEXT: call void @mutually_recursive_test_membound_func_1(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
; CHECK-NEXT: ret void
;
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = zext i32 %tmp to i64
%tmp3 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp2
%tmp4 = load <4 x i32>, ptr addrspace(1) %tmp3, align 16
%tmp5 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp2
store <4 x i32> %tmp4, ptr addrspace(1) %tmp5, align 16
call void @mutually_recursive_test_membound_func_1(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
ret void
}
define void @mutually_recursive_test_membound_func_1(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1) {
; CHECK-LABEL: define void @mutually_recursive_test_membound_func_1(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]]) #[[ATTR1]] {
; CHECK-NEXT: [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP]] to i64
; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG]], i64 [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP7]], align 16
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[ARG1]], i64 [[TMP6]]
; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr addrspace(1) [[TMP9]], align 16
; CHECK-NEXT: call void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
; CHECK-NEXT: ret void
;
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
%tmp2 = zext i32 %tmp to i64
%tmp6 = add nuw nsw i64 %tmp2, 1
%tmp7 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg, i64 %tmp6
%tmp8 = load <4 x i32>, ptr addrspace(1) %tmp7, align 16
%tmp9 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %arg1, i64 %tmp6
store <4 x i32> %tmp8, ptr addrspace(1) %tmp9, align 16
call void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
ret void
}
; GCN-LABEL: {{^}}kernel_call_mutually_recursive_test_membound_func_0:
; GCN: MemoryBound: 1
; GCN: WaveLimiterHint : 1
define amdgpu_kernel void @kernel_call_mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1, ptr %fptr) {
; CHECK-LABEL: define amdgpu_kernel void @kernel_call_mutually_recursive_test_membound_func_0(
; CHECK-SAME: ptr addrspace(1) nocapture readonly [[ARG:%.*]], ptr addrspace(1) nocapture [[ARG1:%.*]], ptr [[FPTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: call void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly [[ARG]], ptr addrspace(1) nocapture [[ARG1]])
; CHECK-NEXT: ret void
;
call void @mutually_recursive_test_membound_func_0(ptr addrspace(1) nocapture readonly %arg, ptr addrspace(1) nocapture %arg1)
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
;.
; CHECK: attributes #[[ATTR0]] = { "amdgpu-memory-bound"="true" "amdgpu-wave-limiter"="true" }
; CHECK: attributes #[[ATTR1]] = { "amdgpu-memory-bound"="true" }
; CHECK: attributes #[[ATTR2]] = { "amdgpu-wave-limiter"="true" }
; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.