AMDGPU: Improve vector of pointer handling in amdgpu-promote-alloca (#114144)
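
The use scan in collectUsesWithPtrTypes now dispatches GEP, select and phi
users through one if/else-if chain instead of rejecting any user whose type
is not a scalar pointer, so vectors of pointers (a vector getelementptr, or
an icmp/select/phi over <N x ptr addrspace(5)>) no longer block promotion to
LDS. Null constant operands are rebuilt with Constant::getNullValue, which
handles ConstantAggregateZero (the null of a pointer vector) as well as
ConstantPointerNull, and operand types are remapped with
Type::getWithNewType so vector element counts survive the address space
change. Selects and phis over vector GEPs of the same alloca still do not
promote, since getUnderlyingObject cannot look through a vector
getelementptr (see the TODOs below).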
llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -1115,9 +1115,10 @@ bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
   if (Val == OtherOp)
     OtherOp = Inst->getOperand(OpIdx1);
 
-  if (isa<ConstantPointerNull>(OtherOp))
+  if (isa<ConstantPointerNull, ConstantAggregateZero>(OtherOp))
     return true;
 
+  // TODO: getUnderlyingObject will not work on a vector getelementptr
   Value *OtherObj = getUnderlyingObject(OtherOp);
   if (!isa<AllocaInst>(OtherObj))
     return false;
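
The key to the one-line change above: isa<> accepts several candidate
classes at once, and the null constant of a vector-of-pointers type is a
ConstantAggregateZero (zeroinitializer), not a ConstantPointerNull. A
minimal standalone sketch of the idiom (the helper name is illustrative,
not from the patch):

    #include "llvm/IR/Constants.h"
    using namespace llvm;

    // True for a null pointer constant of either shape: a scalar
    // 'ptr addrspace(N) null' (ConstantPointerNull) or a vector
    // '<K x ptr addrspace(N)> zeroinitializer' (ConstantAggregateZero).
    static bool isNullPtrLikeConstant(const Value *V) {
      return isa<ConstantPointerNull, ConstantAggregateZero>(V);
    }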
@@ -1195,36 +1196,19 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
       continue;
     }
 
-    // TODO: If we know the address is only observed through flat pointers, we
-    // could still promote.
-    if (UseInst->getOpcode() == Instruction::AddrSpaceCast)
-      return false;
-
-    // Do not promote vector/aggregate type instructions. It is hard to track
-    // their users.
-    if (isa<InsertValueInst>(User) || isa<InsertElementInst>(User))
-      return false;
-
-    // TODO: Handle vectors of pointers.
-    if (!User->getType()->isPointerTy())
-      return false;
-
     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
       // Be conservative if an address could be computed outside the bounds of
       // the alloca.
       if (!GEP->isInBounds())
         return false;
-    }
-
-    // Only promote a select if we know that the other select operand is from
-    // another pointer that will also be promoted.
-    if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
+    } else if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
+      // Only promote a select if we know that the other select operand is from
+      // another pointer that will also be promoted.
       if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
         return false;
-    }
-
-    // Repeat for phis.
-    if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
+    } else if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
+      // Repeat for phis.
+
       // TODO: Handle more complex cases. We should be able to replace loops
       // over arrays.
       switch (Phi->getNumIncomingValues()) {
@@ -1237,6 +1221,15 @@ bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
       default:
         return false;
       }
+    } else if (!isa<ExtractElementInst>(User)) {
+      // Do not promote vector/aggregate type instructions. It is hard to track
+      // their users.
+
+      // Do not promote addrspacecast.
+      //
+      // TODO: If we know the address is only observed through flat pointers, we
+      // could still promote.
+      return false;
     }
 
     WorkList.push_back(User);
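
Condensing the two hunks above: each user of the alloca-derived pointer is
now classified once, and anything that is not a GEP, select, phi, or
extractelement rejects promotion in a single place. A standalone sketch
mirroring that dispatch (the names and the enum are illustrative, not from
the patch):

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    enum class UseKind { Gep, Select, Phi, ExtractElement, Blocking };

    // Mirrors the new if/else-if chain: GEPs must stay in bounds,
    // select/phi operands must derive from the same alloca, and
    // extractelement (scalarizing a vector of pointers) is allowed;
    // everything else (insertelement, insertvalue, addrspacecast, ...)
    // blocks promotion.
    static UseKind classifyPtrUser(const Instruction *UseInst) {
      if (isa<GetElementPtrInst>(UseInst))
        return UseKind::Gep;
      if (isa<SelectInst>(UseInst))
        return UseKind::Select;
      if (isa<PHINode>(UseInst))
        return UseKind::Phi;
      if (isa<ExtractElementInst>(UseInst))
        return UseKind::ExtractElement;
      return UseKind::Blocking;
    }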
@@ -1490,17 +1483,21 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
 
   SmallVector<IntrinsicInst *> DeferredIntrs;
 
+  PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
+
   for (Value *V : WorkList) {
     CallInst *Call = dyn_cast<CallInst>(V);
     if (!Call) {
       if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
-        PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
+        Value *LHS = CI->getOperand(0);
+        Value *RHS = CI->getOperand(1);
 
-        if (isa<ConstantPointerNull>(CI->getOperand(0)))
-          CI->setOperand(0, ConstantPointerNull::get(NewTy));
+        Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
+        if (isa<ConstantPointerNull, ConstantAggregateZero>(LHS))
+          CI->setOperand(0, Constant::getNullValue(NewTy));
 
-        if (isa<ConstantPointerNull>(CI->getOperand(1)))
-          CI->setOperand(1, ConstantPointerNull::get(NewTy));
+        if (isa<ConstantPointerNull, ConstantAggregateZero>(RHS))
+          CI->setOperand(1, Constant::getNullValue(NewTy));
 
         continue;
       }
@@ -1510,25 +1507,23 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
     if (isa<AddrSpaceCastInst>(V))
       continue;
 
-    PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
+    assert(V->getType()->isPtrOrPtrVectorTy());
 
-    assert(isa<PointerType>(V->getType()));
-
     // FIXME: It doesn't really make sense to try to do this for all
     // instructions.
+    Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
     V->mutateType(NewTy);
 
     // Adjust the types of any constant operands.
     if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
-      if (isa<ConstantPointerNull>(SI->getOperand(1)))
-        SI->setOperand(1, ConstantPointerNull::get(NewTy));
+      if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(1)))
+        SI->setOperand(1, Constant::getNullValue(NewTy));
 
-      if (isa<ConstantPointerNull>(SI->getOperand(2)))
-        SI->setOperand(2, ConstantPointerNull::get(NewTy));
+      if (isa<ConstantPointerNull, ConstantAggregateZero>(SI->getOperand(2)))
+        SI->setOperand(2, Constant::getNullValue(NewTy));
     } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
       for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
-        if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
-          Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
+        if (isa<ConstantPointerNull, ConstantAggregateZero>(
+                Phi->getIncomingValue(I)))
+          Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
       }
     }
 
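
Both tryPromoteAllocaToLDS hunks lean on Type::getWithNewType to move a
pointer-ish type into the LDS address space without losing its shape: for a
vector type it rebuilds the vector with the new element type and the same
element count, for a scalar it simply returns the new pointer type. A
minimal sketch, assuming address space 3 stands in for
AMDGPUAS::LOCAL_ADDRESS (the helper name is illustrative, not from the
patch):

    #include "llvm/IR/DerivedTypes.h"
    using namespace llvm;

    // 'ptr addrspace(5)'       -> 'ptr addrspace(3)'
    // '<4 x ptr addrspace(5)>' -> '<4 x ptr addrspace(3)>'
    static Type *rewriteToLDSType(Type *OldTy, LLVMContext &Ctx) {
      PointerType *NewPtrTy = PointerType::get(Ctx, /*AddressSpace=*/3);
      return OldTy->getWithNewType(NewPtrTy);
    }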
llvm/test/CodeGen/AMDGPU/promote-alloca-vector-gep.ll
@@ -7,25 +7,40 @@
 define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
 ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset() {
 ; CHECK-NEXT: [[BB:.*:]]
-; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
-; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT: [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(5)> [[GETELEMENTPTR]], i64 0
-; CHECK-NEXT: store i32 0, ptr addrspace(5) [[EXTRACTELEMENT]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1:![0-9]+]], !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(3)> [[GETELEMENTPTR]], i64 0
+; CHECK-NEXT: store i32 0, ptr addrspace(3) [[EXTRACTELEMENT]], align 4
 ; CHECK-NEXT: ret void
 ;
 bb:
-  %alloca = alloca i32, align 4, addrspace(5)
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
   %getelementptr = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
   %extractelement = extractelement <4 x ptr addrspace(5)> %getelementptr, i64 0
   store i32 0, ptr addrspace(5) %extractelement
   ret void
 }
 
+; TODO: Should be able to promote this
 define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select(i1 %cond) {
 ; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select(
 ; CHECK-SAME: i1 [[COND:%.*]]) {
 ; CHECK-NEXT: [[BB:.*:]]
-; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
+; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4 x i32], align 4, addrspace(5)
 ; CHECK-NEXT: [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
 ; CHECK-NEXT: [[GETELEMENTPTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[ALLOCA]], <4 x i64> <i64 3, i64 2, i64 1, i64 0>
 ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(5)> [[GETELEMENTPTR0]], <4 x ptr addrspace(5)> [[GETELEMENTPTR1]]
@@ -34,7 +49,7 @@ define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select(i1 %c
 ; CHECK-NEXT: ret void
 ;
 bb:
-  %alloca = alloca i32, align 4, addrspace(5)
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
   %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
   %getelementptr1 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 3, i64 2, i64 1, i64 0>
   %select = select i1 %cond, <4 x ptr addrspace(5)> %getelementptr0, <4 x ptr addrspace(5)> %getelementptr1
@@ -42,3 +57,187 @@ bb:
   store i32 0, ptr addrspace(5) %extractelement
   ret void
 }
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr0(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr0(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr0.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT: [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(3)> zeroinitializer, <4 x ptr addrspace(3)> [[GETELEMENTPTR0]]
+; CHECK-NEXT: [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(3)> [[SELECT]], i64 1
+; CHECK-NEXT: store i32 0, ptr addrspace(3) [[EXTRACTELEMENT]], align 4
+; CHECK-NEXT: ret void
+;
+bb:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %select = select i1 %cond, <4 x ptr addrspace(5)> zeroinitializer, <4 x ptr addrspace(5)> %getelementptr0
+  %extractelement = extractelement <4 x ptr addrspace(5)> %select, i64 1
+  store i32 0, ptr addrspace(5) %extractelement
+  ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr1(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr1(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset_select_nullptr1.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT: [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(3)> [[GETELEMENTPTR0]], <4 x ptr addrspace(3)> zeroinitializer
+; CHECK-NEXT: [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(3)> [[SELECT]], i64 1
+; CHECK-NEXT: store i32 0, ptr addrspace(3) [[EXTRACTELEMENT]], align 4
+; CHECK-NEXT: ret void
+;
+bb:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %select = select i1 %cond, <4 x ptr addrspace(5)> %getelementptr0, <4 x ptr addrspace(5)> zeroinitializer
+  %extractelement = extractelement <4 x ptr addrspace(5)> %select, i64 1
+  store i32 0, ptr addrspace(5) %extractelement
+  ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_icmp_nullptr0(i1 %cond, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_icmp_nullptr0(
+; CHECK-SAME: i1 [[COND:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[BB0:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset_icmp_nullptr0.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT: [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(3)> zeroinitializer, <4 x ptr addrspace(3)> [[GETELEMENTPTR0]]
+; CHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x ptr addrspace(3)> [[SELECT]], zeroinitializer
+; CHECK-NEXT: store <4 x i1> [[ICMP]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NEXT: ret void
+;
+bb0:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %select = select i1 %cond, <4 x ptr addrspace(5)> zeroinitializer, <4 x ptr addrspace(5)> %getelementptr0
+  %icmp = icmp eq <4 x ptr addrspace(5)> %select, zeroinitializer
+  store <4 x i1> %icmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_icmp_nullptr1(i1 %cond, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_icmp_nullptr1(
+; CHECK-SAME: i1 [[COND:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[BB0:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset_icmp_nullptr1.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT: [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[COND]], <4 x ptr addrspace(3)> zeroinitializer, <4 x ptr addrspace(3)> [[GETELEMENTPTR0]]
+; CHECK-NEXT: [[ICMP:%.*]] = icmp eq <4 x ptr addrspace(3)> zeroinitializer, [[SELECT]]
+; CHECK-NEXT: store <4 x i1> [[ICMP]], ptr addrspace(1) [[OUT]], align 1
+; CHECK-NEXT: ret void
+;
+bb0:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  %select = select i1 %cond, <4 x ptr addrspace(5)> zeroinitializer, <4 x ptr addrspace(5)> %getelementptr0
+  %icmp = icmp eq <4 x ptr addrspace(5)> zeroinitializer, %select
+  store <4 x i1> %icmp, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_phi_nullptr(i1 %cond, ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @scalar_alloca_ptr_with_vector_gep_offset_phi_nullptr(
+; CHECK-SAME: i1 [[COND:%.*]], ptr addrspace(1) [[OUT:%.*]]) {
+; CHECK-NEXT: [[BB0:.*]]:
+; CHECK-NEXT: [[TMP0:%.*]] = call noalias nonnull dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TMP1]], align 4, !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr addrspace(4) [[TMP0]], i64 2
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr addrspace(4) [[TMP3]], align 4, !range [[RNG1]], !invariant.load [[META0]]
+; CHECK-NEXT: [[TMP5:%.*]] = lshr i32 [[TMP2]], 16
+; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: [[TMP7:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT: [[TMP8:%.*]] = call range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.z()
+; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw i32 [[TMP7]], [[TMP4]]
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP10]], [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x [4 x i32]], ptr addrspace(3) @scalar_alloca_ptr_with_vector_gep_offset_phi_nullptr.alloca, i32 0, i32 [[TMP13]]
+; CHECK-NEXT: br i1 [[COND]], label %[[BB1:.*]], label %[[BB2:.*]]
+; CHECK: [[BB1]]:
+; CHECK-NEXT: [[GETELEMENTPTR0:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP14]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: br label %[[BB2]]
+; CHECK: [[BB2]]:
+; CHECK-NEXT: [[PHI:%.*]] = phi <4 x ptr addrspace(3)> [ [[GETELEMENTPTR0]], %[[BB1]] ], [ zeroinitializer, %[[BB0]] ]
+; CHECK-NEXT: [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(3)> [[PHI]], i64 2
+; CHECK-NEXT: store i32 0, ptr addrspace(3) [[EXTRACTELEMENT]], align 4
+; CHECK-NEXT: ret void
+;
+bb0:
+  %alloca = alloca [4 x i32], align 4, addrspace(5)
+  br i1 %cond, label %bb1, label %bb2
+
+bb1:
+  %getelementptr0 = getelementptr inbounds i8, ptr addrspace(5) %alloca, <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+  br label %bb2
+
+bb2:
+  %phi = phi <4 x ptr addrspace(5)> [ %getelementptr0, %bb1 ], [ zeroinitializer, %bb0]
+  %extractelement = extractelement <4 x ptr addrspace(5)> %phi, i64 2
+  store i32 0, ptr addrspace(5) %extractelement
+  ret void
+}
+;.
+; CHECK: [[META0]] = !{}
+; CHECK: [[RNG1]] = !{i32 0, i32 1025}
+;.