//===- ScopBuilder.cpp ----------------------------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // Create a polyhedral description for a static control flow region. // // The pass creates a polyhedral description of the Scops detected by the SCoP // detection derived from their LLVM-IR code. // //===----------------------------------------------------------------------===// #include "polly/ScopBuilder.h" #include "polly/Options.h" #include "polly/ScopDetection.h" #include "polly/ScopInfo.h" #include "polly/Support/GICHelper.h" #include "polly/Support/ISLTools.h" #include "polly/Support/SCEVValidator.h" #include "polly/Support/ScopHelper.h" #include "polly/Support/VirtualInstruction.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/RegionInfo.h" #include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/Value.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include using namespace llvm; using namespace polly; #define DEBUG_TYPE "polly-scops" STATISTIC(ScopFound, "Number of valid Scops"); STATISTIC(RichScopFound, "Number of Scops containing a loop"); STATISTIC(InfeasibleScops, "Number of SCoPs with statically infeasible context."); bool polly::ModelReadOnlyScalars; // The maximal number of dimensions we allow during invariant load construction. // More complex access ranges will result in very high compile time and are also // unlikely to result in good code. This value is very high and should only // trigger for corner cases (e.g., the "dct_luma" function in h264, SPEC2006). static int const MaxDimensionsInAccessRange = 9; static cl::opt XModelReadOnlyScalars( "polly-analyze-read-only-scalars", cl::desc("Model read-only scalar values in the scop description"), cl::location(ModelReadOnlyScalars), cl::Hidden, cl::ZeroOrMore, cl::init(true), cl::cat(PollyCategory)); static cl::opt OptComputeOut("polly-analysis-computeout", cl::desc("Bound the scop analysis by a maximal amount of " "computational steps (0 means no bound)"), cl::Hidden, cl::init(800000), cl::ZeroOrMore, cl::cat(PollyCategory)); static cl::opt PollyAllowDereferenceOfAllFunctionParams( "polly-allow-dereference-of-all-function-parameters", cl::desc( "Treat all parameters to functions that are pointers as dereferencible." " This is useful for invariant load hoisting, since we can generate" " less runtime checks. This is only valid if all pointers to functions" " are always initialized, so that Polly can choose to hoist" " their loads. "), cl::Hidden, cl::init(false), cl::cat(PollyCategory)); static cl::opt RunTimeChecksMaxArraysPerGroup( "polly-rtc-max-arrays-per-group", cl::desc("The maximal number of arrays to compare in each alias group."), cl::Hidden, cl::ZeroOrMore, cl::init(20), cl::cat(PollyCategory)); static cl::opt RunTimeChecksMaxAccessDisjuncts( "polly-rtc-max-array-disjuncts", cl::desc("The maximal number of disjunts allowed in memory accesses to " "to build RTCs."), cl::Hidden, cl::ZeroOrMore, cl::init(8), cl::cat(PollyCategory)); static cl::opt RunTimeChecksMaxParameters( "polly-rtc-max-parameters", cl::desc("The maximal number of parameters allowed in RTCs."), cl::Hidden, cl::ZeroOrMore, cl::init(8), cl::cat(PollyCategory)); static cl::opt UnprofitableScalarAccs( "polly-unprofitable-scalar-accs", cl::desc("Count statements with scalar accesses as not optimizable"), cl::Hidden, cl::init(false), cl::cat(PollyCategory)); static cl::opt UserContextStr( "polly-context", cl::value_desc("isl parameter set"), cl::desc("Provide additional constraints on the context parameters"), cl::init(""), cl::cat(PollyCategory)); static cl::opt DetectFortranArrays( "polly-detect-fortran-arrays", cl::desc("Detect Fortran arrays and use this for code generation"), cl::Hidden, cl::init(false), cl::cat(PollyCategory)); static cl::opt DetectReductions("polly-detect-reductions", cl::desc("Detect and exploit reductions"), cl::Hidden, cl::ZeroOrMore, cl::init(true), cl::cat(PollyCategory)); // Multiplicative reductions can be disabled separately as these kind of // operations can overflow easily. Additive reductions and bit operations // are in contrast pretty stable. static cl::opt DisableMultiplicativeReductions( "polly-disable-multiplicative-reductions", cl::desc("Disable multiplicative reductions"), cl::Hidden, cl::ZeroOrMore, cl::init(false), cl::cat(PollyCategory)); enum class GranularityChoice { BasicBlocks, ScalarIndependence, Stores }; static cl::opt StmtGranularity( "polly-stmt-granularity", cl::desc( "Algorithm to use for splitting basic blocks into multiple statements"), cl::values(clEnumValN(GranularityChoice::BasicBlocks, "bb", "One statement per basic block"), clEnumValN(GranularityChoice::ScalarIndependence, "scalar-indep", "Scalar independence heuristic"), clEnumValN(GranularityChoice::Stores, "store", "Store-level granularity")), cl::init(GranularityChoice::ScalarIndependence), cl::cat(PollyCategory)); void ScopBuilder::buildInvariantEquivalenceClasses() { DenseMap, LoadInst *> EquivClasses; const InvariantLoadsSetTy &RIL = scop->getRequiredInvariantLoads(); for (LoadInst *LInst : RIL) { const SCEV *PointerSCEV = SE.getSCEV(LInst->getPointerOperand()); Type *Ty = LInst->getType(); LoadInst *&ClassRep = EquivClasses[std::make_pair(PointerSCEV, Ty)]; if (ClassRep) { scop->addInvariantLoadMapping(LInst, ClassRep); continue; } ClassRep = LInst; scop->addInvariantEquivClass( InvariantEquivClassTy{PointerSCEV, MemoryAccessList(), nullptr, Ty}); } } void ScopBuilder::buildPHIAccesses(ScopStmt *PHIStmt, PHINode *PHI, Region *NonAffineSubRegion, bool IsExitBlock) { // PHI nodes that are in the exit block of the region, hence if IsExitBlock is // true, are not modeled as ordinary PHI nodes as they are not part of the // region. However, we model the operands in the predecessor blocks that are // part of the region as regular scalar accesses. // If we can synthesize a PHI we can skip it, however only if it is in // the region. If it is not it can only be in the exit block of the region. // In this case we model the operands but not the PHI itself. auto *Scope = LI.getLoopFor(PHI->getParent()); if (!IsExitBlock && canSynthesize(PHI, *scop, &SE, Scope)) return; // PHI nodes are modeled as if they had been demoted prior to the SCoP // detection. Hence, the PHI is a load of a new memory location in which the // incoming value was written at the end of the incoming basic block. bool OnlyNonAffineSubRegionOperands = true; for (unsigned u = 0; u < PHI->getNumIncomingValues(); u++) { Value *Op = PHI->getIncomingValue(u); BasicBlock *OpBB = PHI->getIncomingBlock(u); ScopStmt *OpStmt = scop->getIncomingStmtFor(PHI->getOperandUse(u)); // Do not build PHI dependences inside a non-affine subregion, but make // sure that the necessary scalar values are still made available. if (NonAffineSubRegion && NonAffineSubRegion->contains(OpBB)) { auto *OpInst = dyn_cast(Op); if (!OpInst || !NonAffineSubRegion->contains(OpInst)) ensureValueRead(Op, OpStmt); continue; } OnlyNonAffineSubRegionOperands = false; ensurePHIWrite(PHI, OpStmt, OpBB, Op, IsExitBlock); } if (!OnlyNonAffineSubRegionOperands && !IsExitBlock) { addPHIReadAccess(PHIStmt, PHI); } } void ScopBuilder::buildScalarDependences(ScopStmt *UserStmt, Instruction *Inst) { assert(!isa(Inst)); // Pull-in required operands. for (Use &Op : Inst->operands()) ensureValueRead(Op.get(), UserStmt); } // Create a sequence of two schedules. Either argument may be null and is // interpreted as the empty schedule. Can also return null if both schedules are // empty. static isl::schedule combineInSequence(isl::schedule Prev, isl::schedule Succ) { if (!Prev) return Succ; if (!Succ) return Prev; return Prev.sequence(Succ); } // Create an isl_multi_union_aff that defines an identity mapping from the // elements of USet to their N-th dimension. // // # Example: // // Domain: { A[i,j]; B[i,j,k] } // N: 1 // // Resulting Mapping: { {A[i,j] -> [(j)]; B[i,j,k] -> [(j)] } // // @param USet A union set describing the elements for which to generate a // mapping. // @param N The dimension to map to. // @returns A mapping from USet to its N-th dimension. static isl::multi_union_pw_aff mapToDimension(isl::union_set USet, int N) { assert(N >= 0); assert(USet); assert(!USet.is_empty()); auto Result = isl::union_pw_multi_aff::empty(USet.get_space()); for (isl::set S : USet.get_set_list()) { int Dim = S.dim(isl::dim::set); auto PMA = isl::pw_multi_aff::project_out_map(S.get_space(), isl::dim::set, N, Dim - N); if (N > 1) PMA = PMA.drop_dims(isl::dim::out, 0, N - 1); Result = Result.add_pw_multi_aff(PMA); } return isl::multi_union_pw_aff(isl::union_pw_multi_aff(Result)); } void ScopBuilder::buildSchedule() { Loop *L = getLoopSurroundingScop(*scop, LI); LoopStackTy LoopStack({LoopStackElementTy(L, nullptr, 0)}); buildSchedule(scop->getRegion().getNode(), LoopStack); assert(LoopStack.size() == 1 && LoopStack.back().L == L); scop->setScheduleTree(LoopStack[0].Schedule); } /// To generate a schedule for the elements in a Region we traverse the Region /// in reverse-post-order and add the contained RegionNodes in traversal order /// to the schedule of the loop that is currently at the top of the LoopStack. /// For loop-free codes, this results in a correct sequential ordering. /// /// Example: /// bb1(0) /// / \. /// bb2(1) bb3(2) /// \ / \. /// bb4(3) bb5(4) /// \ / /// bb6(5) /// /// Including loops requires additional processing. Whenever a loop header is /// encountered, the corresponding loop is added to the @p LoopStack. Starting /// from an empty schedule, we first process all RegionNodes that are within /// this loop and complete the sequential schedule at this loop-level before /// processing about any other nodes. To implement this /// loop-nodes-first-processing, the reverse post-order traversal is /// insufficient. Hence, we additionally check if the traversal yields /// sub-regions or blocks that are outside the last loop on the @p LoopStack. /// These region-nodes are then queue and only traverse after the all nodes /// within the current loop have been processed. void ScopBuilder::buildSchedule(Region *R, LoopStackTy &LoopStack) { Loop *OuterScopLoop = getLoopSurroundingScop(*scop, LI); ReversePostOrderTraversal RTraversal(R); std::deque WorkList(RTraversal.begin(), RTraversal.end()); std::deque DelayList; bool LastRNWaiting = false; // Iterate over the region @p R in reverse post-order but queue // sub-regions/blocks iff they are not part of the last encountered but not // completely traversed loop. The variable LastRNWaiting is a flag to indicate // that we queued the last sub-region/block from the reverse post-order // iterator. If it is set we have to explore the next sub-region/block from // the iterator (if any) to guarantee progress. If it is not set we first try // the next queued sub-region/blocks. while (!WorkList.empty() || !DelayList.empty()) { RegionNode *RN; if ((LastRNWaiting && !WorkList.empty()) || DelayList.empty()) { RN = WorkList.front(); WorkList.pop_front(); LastRNWaiting = false; } else { RN = DelayList.front(); DelayList.pop_front(); } Loop *L = getRegionNodeLoop(RN, LI); if (!scop->contains(L)) L = OuterScopLoop; Loop *LastLoop = LoopStack.back().L; if (LastLoop != L) { if (LastLoop && !LastLoop->contains(L)) { LastRNWaiting = true; DelayList.push_back(RN); continue; } LoopStack.push_back({L, nullptr, 0}); } buildSchedule(RN, LoopStack); } } void ScopBuilder::buildSchedule(RegionNode *RN, LoopStackTy &LoopStack) { if (RN->isSubRegion()) { auto *LocalRegion = RN->getNodeAs(); if (!scop->isNonAffineSubRegion(LocalRegion)) { buildSchedule(LocalRegion, LoopStack); return; } } assert(LoopStack.rbegin() != LoopStack.rend()); auto LoopData = LoopStack.rbegin(); LoopData->NumBlocksProcessed += getNumBlocksInRegionNode(RN); for (auto *Stmt : scop->getStmtListFor(RN)) { isl::union_set UDomain{Stmt->getDomain()}; auto StmtSchedule = isl::schedule::from_domain(UDomain); LoopData->Schedule = combineInSequence(LoopData->Schedule, StmtSchedule); } // Check if we just processed the last node in this loop. If we did, finalize // the loop by: // // - adding new schedule dimensions // - folding the resulting schedule into the parent loop schedule // - dropping the loop schedule from the LoopStack. // // Then continue to check surrounding loops, which might also have been // completed by this node. size_t Dimension = LoopStack.size(); while (LoopData->L && LoopData->NumBlocksProcessed == getNumBlocksInLoop(LoopData->L)) { isl::schedule Schedule = LoopData->Schedule; auto NumBlocksProcessed = LoopData->NumBlocksProcessed; assert(std::next(LoopData) != LoopStack.rend()); ++LoopData; --Dimension; if (Schedule) { isl::union_set Domain = Schedule.get_domain(); isl::multi_union_pw_aff MUPA = mapToDimension(Domain, Dimension); Schedule = Schedule.insert_partial_schedule(MUPA); LoopData->Schedule = combineInSequence(LoopData->Schedule, Schedule); } LoopData->NumBlocksProcessed += NumBlocksProcessed; } // Now pop all loops processed up there from the LoopStack LoopStack.erase(LoopStack.begin() + Dimension, LoopStack.end()); } void ScopBuilder::buildEscapingDependences(Instruction *Inst) { // Check for uses of this instruction outside the scop. Because we do not // iterate over such instructions and therefore did not "ensure" the existence // of a write, we must determine such use here. if (scop->isEscaping(Inst)) ensureValueWrite(Inst); } /// Check that a value is a Fortran Array descriptor. /// /// We check if V has the following structure: /// %"struct.array1_real(kind=8)" = type { i8*, i, i, /// [ x %struct.descriptor_dimension] } /// /// /// %struct.descriptor_dimension = type { i, i, i } /// /// 1. V's type name starts with "struct.array" /// 2. V's type has layout as shown. /// 3. Final member of V's type has name "struct.descriptor_dimension", /// 4. "struct.descriptor_dimension" has layout as shown. /// 5. Consistent use of i where is some fixed integer number. /// /// We are interested in such types since this is the code that dragonegg /// generates for Fortran array descriptors. /// /// @param V the Value to be checked. /// /// @returns True if V is a Fortran array descriptor, False otherwise. bool isFortranArrayDescriptor(Value *V) { PointerType *PTy = dyn_cast(V->getType()); if (!PTy) return false; Type *Ty = PTy->getElementType(); assert(Ty && "Ty expected to be initialized"); auto *StructArrTy = dyn_cast(Ty); if (!(StructArrTy && StructArrTy->hasName())) return false; if (!StructArrTy->getName().startswith("struct.array")) return false; if (StructArrTy->getNumElements() != 4) return false; const ArrayRef ArrMemberTys = StructArrTy->elements(); // i8* match if (ArrMemberTys[0] != Type::getInt8PtrTy(V->getContext())) return false; // Get a reference to the int type and check that all the members // share the same int type Type *IntTy = ArrMemberTys[1]; if (ArrMemberTys[2] != IntTy) return false; // type: [ x %struct.descriptor_dimension] ArrayType *DescriptorDimArrayTy = dyn_cast(ArrMemberTys[3]); if (!DescriptorDimArrayTy) return false; // type: %struct.descriptor_dimension := type { ixx, ixx, ixx } StructType *DescriptorDimTy = dyn_cast(DescriptorDimArrayTy->getElementType()); if (!(DescriptorDimTy && DescriptorDimTy->hasName())) return false; if (DescriptorDimTy->getName() != "struct.descriptor_dimension") return false; if (DescriptorDimTy->getNumElements() != 3) return false; for (auto MemberTy : DescriptorDimTy->elements()) { if (MemberTy != IntTy) return false; } return true; } Value *ScopBuilder::findFADAllocationVisible(MemAccInst Inst) { // match: 4.1 & 4.2 store/load if (!isa(Inst) && !isa(Inst)) return nullptr; // match: 4 if (Inst.getAlignment() != 8) return nullptr; Value *Address = Inst.getPointerOperand(); const BitCastInst *Bitcast = nullptr; // [match: 3] if (auto *Slot = dyn_cast(Address)) { Value *TypedMem = Slot->getPointerOperand(); // match: 2 Bitcast = dyn_cast(TypedMem); } else { // match: 2 Bitcast = dyn_cast(Address); } if (!Bitcast) return nullptr; auto *MallocMem = Bitcast->getOperand(0); // match: 1 auto *MallocCall = dyn_cast(MallocMem); if (!MallocCall) return nullptr; Function *MallocFn = MallocCall->getCalledFunction(); if (!(MallocFn && MallocFn->hasName() && MallocFn->getName() == "malloc")) return nullptr; // Find all uses the malloc'd memory. // We are looking for a "store" into a struct with the type being the Fortran // descriptor type for (auto user : MallocMem->users()) { /// match: 5 auto *MallocStore = dyn_cast(user); if (!MallocStore) continue; auto *DescriptorGEP = dyn_cast(MallocStore->getPointerOperand()); if (!DescriptorGEP) continue; // match: 5 auto DescriptorType = dyn_cast(DescriptorGEP->getSourceElementType()); if (!(DescriptorType && DescriptorType->hasName())) continue; Value *Descriptor = dyn_cast(DescriptorGEP->getPointerOperand()); if (!Descriptor) continue; if (!isFortranArrayDescriptor(Descriptor)) continue; return Descriptor; } return nullptr; } Value *ScopBuilder::findFADAllocationInvisible(MemAccInst Inst) { // match: 3 if (!isa(Inst) && !isa(Inst)) return nullptr; Value *Slot = Inst.getPointerOperand(); LoadInst *MemLoad = nullptr; // [match: 2] if (auto *SlotGEP = dyn_cast(Slot)) { // match: 1 MemLoad = dyn_cast(SlotGEP->getPointerOperand()); } else { // match: 1 MemLoad = dyn_cast(Slot); } if (!MemLoad) return nullptr; auto *BitcastOperator = dyn_cast(MemLoad->getPointerOperand()); if (!BitcastOperator) return nullptr; Value *Descriptor = dyn_cast(BitcastOperator->getOperand(0)); if (!Descriptor) return nullptr; if (!isFortranArrayDescriptor(Descriptor)) return nullptr; return Descriptor; } void ScopBuilder::addRecordedAssumptions() { for (auto &AS : llvm::reverse(scop->recorded_assumptions())) { if (!AS.BB) { scop->addAssumption(AS.Kind, AS.Set, AS.Loc, AS.Sign, nullptr /* BasicBlock */); continue; } // If the domain was deleted the assumptions are void. isl_set *Dom = scop->getDomainConditions(AS.BB).release(); if (!Dom) continue; // If a basic block was given use its domain to simplify the assumption. // In case of restrictions we know they only have to hold on the domain, // thus we can intersect them with the domain of the block. However, for // assumptions the domain has to imply them, thus: // _ _____ // Dom => S <==> A v B <==> A - B // // To avoid the complement we will register A - B as a restriction not an // assumption. isl_set *S = AS.Set.copy(); if (AS.Sign == AS_RESTRICTION) S = isl_set_params(isl_set_intersect(S, Dom)); else /* (AS.Sign == AS_ASSUMPTION) */ S = isl_set_params(isl_set_subtract(Dom, S)); scop->addAssumption(AS.Kind, isl::manage(S), AS.Loc, AS_RESTRICTION, AS.BB); } scop->clearRecordedAssumptions(); } bool ScopBuilder::buildAccessMultiDimFixed(MemAccInst Inst, ScopStmt *Stmt) { Value *Val = Inst.getValueOperand(); Type *ElementType = Val->getType(); Value *Address = Inst.getPointerOperand(); const SCEV *AccessFunction = SE.getSCEVAtScope(Address, LI.getLoopFor(Inst->getParent())); const SCEVUnknown *BasePointer = dyn_cast(SE.getPointerBase(AccessFunction)); enum MemoryAccess::AccessType AccType = isa(Inst) ? MemoryAccess::READ : MemoryAccess::MUST_WRITE; if (auto *BitCast = dyn_cast(Address)) { auto *Src = BitCast->getOperand(0); auto *SrcTy = Src->getType(); auto *DstTy = BitCast->getType(); // Do not try to delinearize non-sized (opaque) pointers. if ((SrcTy->isPointerTy() && !SrcTy->getPointerElementType()->isSized()) || (DstTy->isPointerTy() && !DstTy->getPointerElementType()->isSized())) { return false; } if (SrcTy->isPointerTy() && DstTy->isPointerTy() && DL.getTypeAllocSize(SrcTy->getPointerElementType()) == DL.getTypeAllocSize(DstTy->getPointerElementType())) Address = Src; } auto *GEP = dyn_cast(Address); if (!GEP) return false; std::vector Subscripts; std::vector Sizes; std::tie(Subscripts, Sizes) = getIndexExpressionsFromGEP(GEP, SE); auto *BasePtr = GEP->getOperand(0); if (auto *BasePtrCast = dyn_cast(BasePtr)) BasePtr = BasePtrCast->getOperand(0); // Check for identical base pointers to ensure that we do not miss index // offsets that have been added before this GEP is applied. if (BasePtr != BasePointer->getValue()) return false; std::vector SizesSCEV; const InvariantLoadsSetTy &ScopRIL = scop->getRequiredInvariantLoads(); Loop *SurroundingLoop = Stmt->getSurroundingLoop(); for (auto *Subscript : Subscripts) { InvariantLoadsSetTy AccessILS; if (!isAffineExpr(&scop->getRegion(), SurroundingLoop, Subscript, SE, &AccessILS)) return false; for (LoadInst *LInst : AccessILS) if (!ScopRIL.count(LInst)) return false; } if (Sizes.empty()) return false; SizesSCEV.push_back(nullptr); for (auto V : Sizes) SizesSCEV.push_back(SE.getSCEV( ConstantInt::get(IntegerType::getInt64Ty(BasePtr->getContext()), V))); addArrayAccess(Stmt, Inst, AccType, BasePointer->getValue(), ElementType, true, Subscripts, SizesSCEV, Val); return true; } bool ScopBuilder::buildAccessMultiDimParam(MemAccInst Inst, ScopStmt *Stmt) { if (!PollyDelinearize) return false; Value *Address = Inst.getPointerOperand(); Value *Val = Inst.getValueOperand(); Type *ElementType = Val->getType(); unsigned ElementSize = DL.getTypeAllocSize(ElementType); enum MemoryAccess::AccessType AccType = isa(Inst) ? MemoryAccess::READ : MemoryAccess::MUST_WRITE; const SCEV *AccessFunction = SE.getSCEVAtScope(Address, LI.getLoopFor(Inst->getParent())); const SCEVUnknown *BasePointer = dyn_cast(SE.getPointerBase(AccessFunction)); assert(BasePointer && "Could not find base pointer"); auto &InsnToMemAcc = scop->getInsnToMemAccMap(); auto AccItr = InsnToMemAcc.find(Inst); if (AccItr == InsnToMemAcc.end()) return false; std::vector Sizes = {nullptr}; Sizes.insert(Sizes.end(), AccItr->second.Shape->DelinearizedSizes.begin(), AccItr->second.Shape->DelinearizedSizes.end()); // In case only the element size is contained in the 'Sizes' array, the // access does not access a real multi-dimensional array. Hence, we allow // the normal single-dimensional access construction to handle this. if (Sizes.size() == 1) return false; // Remove the element size. This information is already provided by the // ElementSize parameter. In case the element size of this access and the // element size used for delinearization differs the delinearization is // incorrect. Hence, we invalidate the scop. // // TODO: Handle delinearization with differing element sizes. auto DelinearizedSize = cast(Sizes.back())->getAPInt().getSExtValue(); Sizes.pop_back(); if (ElementSize != DelinearizedSize) scop->invalidate(DELINEARIZATION, Inst->getDebugLoc(), Inst->getParent()); addArrayAccess(Stmt, Inst, AccType, BasePointer->getValue(), ElementType, true, AccItr->second.DelinearizedSubscripts, Sizes, Val); return true; } bool ScopBuilder::buildAccessMemIntrinsic(MemAccInst Inst, ScopStmt *Stmt) { auto *MemIntr = dyn_cast_or_null(Inst); if (MemIntr == nullptr) return false; auto *L = LI.getLoopFor(Inst->getParent()); auto *LengthVal = SE.getSCEVAtScope(MemIntr->getLength(), L); assert(LengthVal); // Check if the length val is actually affine or if we overapproximate it InvariantLoadsSetTy AccessILS; const InvariantLoadsSetTy &ScopRIL = scop->getRequiredInvariantLoads(); Loop *SurroundingLoop = Stmt->getSurroundingLoop(); bool LengthIsAffine = isAffineExpr(&scop->getRegion(), SurroundingLoop, LengthVal, SE, &AccessILS); for (LoadInst *LInst : AccessILS) if (!ScopRIL.count(LInst)) LengthIsAffine = false; if (!LengthIsAffine) LengthVal = nullptr; auto *DestPtrVal = MemIntr->getDest(); assert(DestPtrVal); auto *DestAccFunc = SE.getSCEVAtScope(DestPtrVal, L); assert(DestAccFunc); // Ignore accesses to "NULL". // TODO: We could use this to optimize the region further, e.g., intersect // the context with // isl_set_complement(isl_set_params(getDomain())) // as we know it would be undefined to execute this instruction anyway. if (DestAccFunc->isZero()) return true; auto *DestPtrSCEV = dyn_cast(SE.getPointerBase(DestAccFunc)); assert(DestPtrSCEV); DestAccFunc = SE.getMinusSCEV(DestAccFunc, DestPtrSCEV); addArrayAccess(Stmt, Inst, MemoryAccess::MUST_WRITE, DestPtrSCEV->getValue(), IntegerType::getInt8Ty(DestPtrVal->getContext()), LengthIsAffine, {DestAccFunc, LengthVal}, {nullptr}, Inst.getValueOperand()); auto *MemTrans = dyn_cast(MemIntr); if (!MemTrans) return true; auto *SrcPtrVal = MemTrans->getSource(); assert(SrcPtrVal); auto *SrcAccFunc = SE.getSCEVAtScope(SrcPtrVal, L); assert(SrcAccFunc); // Ignore accesses to "NULL". // TODO: See above TODO if (SrcAccFunc->isZero()) return true; auto *SrcPtrSCEV = dyn_cast(SE.getPointerBase(SrcAccFunc)); assert(SrcPtrSCEV); SrcAccFunc = SE.getMinusSCEV(SrcAccFunc, SrcPtrSCEV); addArrayAccess(Stmt, Inst, MemoryAccess::READ, SrcPtrSCEV->getValue(), IntegerType::getInt8Ty(SrcPtrVal->getContext()), LengthIsAffine, {SrcAccFunc, LengthVal}, {nullptr}, Inst.getValueOperand()); return true; } bool ScopBuilder::buildAccessCallInst(MemAccInst Inst, ScopStmt *Stmt) { auto *CI = dyn_cast_or_null(Inst); if (CI == nullptr) return false; if (CI->doesNotAccessMemory() || isIgnoredIntrinsic(CI) || isDebugCall(CI)) return true; bool ReadOnly = false; auto *AF = SE.getConstant(IntegerType::getInt64Ty(CI->getContext()), 0); auto *CalledFunction = CI->getCalledFunction(); switch (AA.getModRefBehavior(CalledFunction)) { case FMRB_UnknownModRefBehavior: llvm_unreachable("Unknown mod ref behaviour cannot be represented."); case FMRB_DoesNotAccessMemory: return true; case FMRB_DoesNotReadMemory: case FMRB_OnlyAccessesInaccessibleMem: case FMRB_OnlyAccessesInaccessibleOrArgMem: return false; case FMRB_OnlyReadsMemory: GlobalReads.emplace_back(Stmt, CI); return true; case FMRB_OnlyReadsArgumentPointees: ReadOnly = true; LLVM_FALLTHROUGH; case FMRB_OnlyAccessesArgumentPointees: { auto AccType = ReadOnly ? MemoryAccess::READ : MemoryAccess::MAY_WRITE; Loop *L = LI.getLoopFor(Inst->getParent()); for (const auto &Arg : CI->arg_operands()) { if (!Arg->getType()->isPointerTy()) continue; auto *ArgSCEV = SE.getSCEVAtScope(Arg, L); if (ArgSCEV->isZero()) continue; auto *ArgBasePtr = cast(SE.getPointerBase(ArgSCEV)); addArrayAccess(Stmt, Inst, AccType, ArgBasePtr->getValue(), ArgBasePtr->getType(), false, {AF}, {nullptr}, CI); } return true; } } return true; } void ScopBuilder::buildAccessSingleDim(MemAccInst Inst, ScopStmt *Stmt) { Value *Address = Inst.getPointerOperand(); Value *Val = Inst.getValueOperand(); Type *ElementType = Val->getType(); enum MemoryAccess::AccessType AccType = isa(Inst) ? MemoryAccess::READ : MemoryAccess::MUST_WRITE; const SCEV *AccessFunction = SE.getSCEVAtScope(Address, LI.getLoopFor(Inst->getParent())); const SCEVUnknown *BasePointer = dyn_cast(SE.getPointerBase(AccessFunction)); assert(BasePointer && "Could not find base pointer"); AccessFunction = SE.getMinusSCEV(AccessFunction, BasePointer); // Check if the access depends on a loop contained in a non-affine subregion. bool isVariantInNonAffineLoop = false; SetVector Loops; findLoops(AccessFunction, Loops); for (const Loop *L : Loops) if (Stmt->contains(L)) { isVariantInNonAffineLoop = true; break; } InvariantLoadsSetTy AccessILS; Loop *SurroundingLoop = Stmt->getSurroundingLoop(); bool IsAffine = !isVariantInNonAffineLoop && isAffineExpr(&scop->getRegion(), SurroundingLoop, AccessFunction, SE, &AccessILS); const InvariantLoadsSetTy &ScopRIL = scop->getRequiredInvariantLoads(); for (LoadInst *LInst : AccessILS) if (!ScopRIL.count(LInst)) IsAffine = false; if (!IsAffine && AccType == MemoryAccess::MUST_WRITE) AccType = MemoryAccess::MAY_WRITE; addArrayAccess(Stmt, Inst, AccType, BasePointer->getValue(), ElementType, IsAffine, {AccessFunction}, {nullptr}, Val); } void ScopBuilder::buildMemoryAccess(MemAccInst Inst, ScopStmt *Stmt) { if (buildAccessMemIntrinsic(Inst, Stmt)) return; if (buildAccessCallInst(Inst, Stmt)) return; if (buildAccessMultiDimFixed(Inst, Stmt)) return; if (buildAccessMultiDimParam(Inst, Stmt)) return; buildAccessSingleDim(Inst, Stmt); } void ScopBuilder::buildAccessFunctions() { for (auto &Stmt : *scop) { if (Stmt.isBlockStmt()) { buildAccessFunctions(&Stmt, *Stmt.getBasicBlock()); continue; } Region *R = Stmt.getRegion(); for (BasicBlock *BB : R->blocks()) buildAccessFunctions(&Stmt, *BB, R); } // Build write accesses for values that are used after the SCoP. // The instructions defining them might be synthesizable and therefore not // contained in any statement, hence we iterate over the original instructions // to identify all escaping values. for (BasicBlock *BB : scop->getRegion().blocks()) { for (Instruction &Inst : *BB) buildEscapingDependences(&Inst); } } bool ScopBuilder::shouldModelInst(Instruction *Inst, Loop *L) { return !Inst->isTerminator() && !isIgnoredIntrinsic(Inst) && !canSynthesize(Inst, *scop, &SE, L); } /// Generate a name for a statement. /// /// @param BB The basic block the statement will represent. /// @param BBIdx The index of the @p BB relative to other BBs/regions. /// @param Count The index of the created statement in @p BB. /// @param IsMain Whether this is the main of all statement for @p BB. If true, /// no suffix will be added. /// @param IsLast Uses a special indicator for the last statement of a BB. static std::string makeStmtName(BasicBlock *BB, long BBIdx, int Count, bool IsMain, bool IsLast = false) { std::string Suffix; if (!IsMain) { if (UseInstructionNames) Suffix = '_'; if (IsLast) Suffix += "last"; else if (Count < 26) Suffix += 'a' + Count; else Suffix += std::to_string(Count); } return getIslCompatibleName("Stmt", BB, BBIdx, Suffix, UseInstructionNames); } /// Generate a name for a statement that represents a non-affine subregion. /// /// @param R The region the statement will represent. /// @param RIdx The index of the @p R relative to other BBs/regions. static std::string makeStmtName(Region *R, long RIdx) { return getIslCompatibleName("Stmt", R->getNameStr(), RIdx, "", UseInstructionNames); } void ScopBuilder::buildSequentialBlockStmts(BasicBlock *BB, bool SplitOnStore) { Loop *SurroundingLoop = LI.getLoopFor(BB); int Count = 0; long BBIdx = scop->getNextStmtIdx(); std::vector Instructions; for (Instruction &Inst : *BB) { if (shouldModelInst(&Inst, SurroundingLoop)) Instructions.push_back(&Inst); if (Inst.getMetadata("polly_split_after") || (SplitOnStore && isa(Inst))) { std::string Name = makeStmtName(BB, BBIdx, Count, Count == 0); scop->addScopStmt(BB, Name, SurroundingLoop, Instructions); Count++; Instructions.clear(); } } std::string Name = makeStmtName(BB, BBIdx, Count, Count == 0); scop->addScopStmt(BB, Name, SurroundingLoop, Instructions); } /// Is @p Inst an ordered instruction? /// /// An unordered instruction is an instruction, such that a sequence of /// unordered instructions can be permuted without changing semantics. Any /// instruction for which this is not always the case is ordered. static bool isOrderedInstruction(Instruction *Inst) { return Inst->mayHaveSideEffects() || Inst->mayReadOrWriteMemory(); } /// Join instructions to the same statement if one uses the scalar result of the /// other. static void joinOperandTree(EquivalenceClasses &UnionFind, ArrayRef ModeledInsts) { for (Instruction *Inst : ModeledInsts) { if (isa(Inst)) continue; for (Use &Op : Inst->operands()) { Instruction *OpInst = dyn_cast(Op.get()); if (!OpInst) continue; // Check if OpInst is in the BB and is a modeled instruction. auto OpVal = UnionFind.findValue(OpInst); if (OpVal == UnionFind.end()) continue; UnionFind.unionSets(Inst, OpInst); } } } /// Ensure that the order of ordered instructions does not change. /// /// If we encounter an ordered instruction enclosed in instructions belonging to /// a different statement (which might as well contain ordered instructions, but /// this is not tested here), join them. static void joinOrderedInstructions(EquivalenceClasses &UnionFind, ArrayRef ModeledInsts) { SetVector SeenLeaders; for (Instruction *Inst : ModeledInsts) { if (!isOrderedInstruction(Inst)) continue; Instruction *Leader = UnionFind.getLeaderValue(Inst); bool Inserted = SeenLeaders.insert(Leader); if (Inserted) continue; // Merge statements to close holes. Say, we have already seen statements A // and B, in this order. Then we see an instruction of A again and we would // see the pattern "A B A". This function joins all statements until the // only seen occurrence of A. for (Instruction *Prev : reverse(SeenLeaders)) { // Items added to 'SeenLeaders' are leaders, but may have lost their // leadership status when merged into another statement. Instruction *PrevLeader = UnionFind.getLeaderValue(SeenLeaders.back()); if (PrevLeader == Leader) break; UnionFind.unionSets(Prev, Leader); } } } /// If the BasicBlock has an edge from itself, ensure that the PHI WRITEs for /// the incoming values from this block are executed after the PHI READ. /// /// Otherwise it could overwrite the incoming value from before the BB with the /// value for the next execution. This can happen if the PHI WRITE is added to /// the statement with the instruction that defines the incoming value (instead /// of the last statement of the same BB). To ensure that the PHI READ and WRITE /// are in order, we put both into the statement. PHI WRITEs are always executed /// after PHI READs when they are in the same statement. /// /// TODO: This is an overpessimization. We only have to ensure that the PHI /// WRITE is not put into a statement containing the PHI itself. That could also /// be done by /// - having all (strongly connected) PHIs in a single statement, /// - unite only the PHIs in the operand tree of the PHI WRITE (because it only /// has a chance of being lifted before a PHI by being in a statement with a /// PHI that comes before in the basic block), or /// - when uniting statements, ensure that no (relevant) PHIs are overtaken. static void joinOrderedPHIs(EquivalenceClasses &UnionFind, ArrayRef ModeledInsts) { for (Instruction *Inst : ModeledInsts) { PHINode *PHI = dyn_cast(Inst); if (!PHI) continue; int Idx = PHI->getBasicBlockIndex(PHI->getParent()); if (Idx < 0) continue; Instruction *IncomingVal = dyn_cast(PHI->getIncomingValue(Idx)); if (!IncomingVal) continue; UnionFind.unionSets(PHI, IncomingVal); } } void ScopBuilder::buildEqivClassBlockStmts(BasicBlock *BB) { Loop *L = LI.getLoopFor(BB); // Extracting out modeled instructions saves us from checking // shouldModelInst() repeatedly. SmallVector ModeledInsts; EquivalenceClasses UnionFind; Instruction *MainInst = nullptr; for (Instruction &Inst : *BB) { if (!shouldModelInst(&Inst, L)) continue; ModeledInsts.push_back(&Inst); UnionFind.insert(&Inst); // When a BB is split into multiple statements, the main statement is the // one containing the 'main' instruction. We select the first instruction // that is unlikely to be removed (because it has side-effects) as the main // one. It is used to ensure that at least one statement from the bb has the // same name as with -polly-stmt-granularity=bb. if (!MainInst && (isa(Inst) || (isa(Inst) && !isa(Inst)))) MainInst = &Inst; } joinOperandTree(UnionFind, ModeledInsts); joinOrderedInstructions(UnionFind, ModeledInsts); joinOrderedPHIs(UnionFind, ModeledInsts); // The list of instructions for statement (statement represented by the leader // instruction). The order of statements instructions is reversed such that // the epilogue is first. This makes it easier to ensure that the epilogue is // the last statement. MapVector> LeaderToInstList; // Collect the instructions of all leaders. UnionFind's member iterator // unfortunately are not in any specific order. for (Instruction &Inst : reverse(*BB)) { auto LeaderIt = UnionFind.findLeader(&Inst); if (LeaderIt == UnionFind.member_end()) continue; std::vector &InstList = LeaderToInstList[*LeaderIt]; InstList.push_back(&Inst); } // Finally build the statements. int Count = 0; long BBIdx = scop->getNextStmtIdx(); bool MainFound = false; for (auto &Instructions : reverse(LeaderToInstList)) { std::vector &InstList = Instructions.second; // If there is no main instruction, make the first statement the main. bool IsMain; if (MainInst) IsMain = std::find(InstList.begin(), InstList.end(), MainInst) != InstList.end(); else IsMain = (Count == 0); if (IsMain) MainFound = true; std::reverse(InstList.begin(), InstList.end()); std::string Name = makeStmtName(BB, BBIdx, Count, IsMain); scop->addScopStmt(BB, Name, L, std::move(InstList)); Count += 1; } // Unconditionally add an epilogue (last statement). It contains no // instructions, but holds the PHI write accesses for successor basic blocks, // if the incoming value is not defined in another statement if the same BB. // The epilogue will be removed if no PHIWrite is added to it. std::string EpilogueName = makeStmtName(BB, BBIdx, Count, !MainFound, true); scop->addScopStmt(BB, EpilogueName, L, {}); } void ScopBuilder::buildStmts(Region &SR) { if (scop->isNonAffineSubRegion(&SR)) { std::vector Instructions; Loop *SurroundingLoop = getFirstNonBoxedLoopFor(SR.getEntry(), LI, scop->getBoxedLoops()); for (Instruction &Inst : *SR.getEntry()) if (shouldModelInst(&Inst, SurroundingLoop)) Instructions.push_back(&Inst); long RIdx = scop->getNextStmtIdx(); std::string Name = makeStmtName(&SR, RIdx); scop->addScopStmt(&SR, Name, SurroundingLoop, Instructions); return; } for (auto I = SR.element_begin(), E = SR.element_end(); I != E; ++I) if (I->isSubRegion()) buildStmts(*I->getNodeAs()); else { BasicBlock *BB = I->getNodeAs(); switch (StmtGranularity) { case GranularityChoice::BasicBlocks: buildSequentialBlockStmts(BB); break; case GranularityChoice::ScalarIndependence: buildEqivClassBlockStmts(BB); break; case GranularityChoice::Stores: buildSequentialBlockStmts(BB, true); break; } } } void ScopBuilder::buildAccessFunctions(ScopStmt *Stmt, BasicBlock &BB, Region *NonAffineSubRegion) { assert( Stmt && "The exit BB is the only one that cannot be represented by a statement"); assert(Stmt->represents(&BB)); // We do not build access functions for error blocks, as they may contain // instructions we can not model. if (isErrorBlock(BB, scop->getRegion(), LI, DT)) return; auto BuildAccessesForInst = [this, Stmt, NonAffineSubRegion](Instruction *Inst) { PHINode *PHI = dyn_cast(Inst); if (PHI) buildPHIAccesses(Stmt, PHI, NonAffineSubRegion, false); if (auto MemInst = MemAccInst::dyn_cast(*Inst)) { assert(Stmt && "Cannot build access function in non-existing statement"); buildMemoryAccess(MemInst, Stmt); } // PHI nodes have already been modeled above and terminators that are // not part of a non-affine subregion are fully modeled and regenerated // from the polyhedral domains. Hence, they do not need to be modeled as // explicit data dependences. if (!PHI) buildScalarDependences(Stmt, Inst); }; const InvariantLoadsSetTy &RIL = scop->getRequiredInvariantLoads(); bool IsEntryBlock = (Stmt->getEntryBlock() == &BB); if (IsEntryBlock) { for (Instruction *Inst : Stmt->getInstructions()) BuildAccessesForInst(Inst); if (Stmt->isRegionStmt()) BuildAccessesForInst(BB.getTerminator()); } else { for (Instruction &Inst : BB) { if (isIgnoredIntrinsic(&Inst)) continue; // Invariant loads already have been processed. if (isa(Inst) && RIL.count(cast(&Inst))) continue; BuildAccessesForInst(&Inst); } } } MemoryAccess *ScopBuilder::addMemoryAccess( ScopStmt *Stmt, Instruction *Inst, MemoryAccess::AccessType AccType, Value *BaseAddress, Type *ElementType, bool Affine, Value *AccessValue, ArrayRef Subscripts, ArrayRef Sizes, MemoryKind Kind) { bool isKnownMustAccess = false; // Accesses in single-basic block statements are always executed. if (Stmt->isBlockStmt()) isKnownMustAccess = true; if (Stmt->isRegionStmt()) { // Accesses that dominate the exit block of a non-affine region are always // executed. In non-affine regions there may exist MemoryKind::Values that // do not dominate the exit. MemoryKind::Values will always dominate the // exit and MemoryKind::PHIs only if there is at most one PHI_WRITE in the // non-affine region. if (Inst && DT.dominates(Inst->getParent(), Stmt->getRegion()->getExit())) isKnownMustAccess = true; } // Non-affine PHI writes do not "happen" at a particular instruction, but // after exiting the statement. Therefore they are guaranteed to execute and // overwrite the old value. if (Kind == MemoryKind::PHI || Kind == MemoryKind::ExitPHI) isKnownMustAccess = true; if (!isKnownMustAccess && AccType == MemoryAccess::MUST_WRITE) AccType = MemoryAccess::MAY_WRITE; auto *Access = new MemoryAccess(Stmt, Inst, AccType, BaseAddress, ElementType, Affine, Subscripts, Sizes, AccessValue, Kind); scop->addAccessFunction(Access); Stmt->addAccess(Access); return Access; } void ScopBuilder::addArrayAccess(ScopStmt *Stmt, MemAccInst MemAccInst, MemoryAccess::AccessType AccType, Value *BaseAddress, Type *ElementType, bool IsAffine, ArrayRef Subscripts, ArrayRef Sizes, Value *AccessValue) { ArrayBasePointers.insert(BaseAddress); auto *MemAccess = addMemoryAccess(Stmt, MemAccInst, AccType, BaseAddress, ElementType, IsAffine, AccessValue, Subscripts, Sizes, MemoryKind::Array); if (!DetectFortranArrays) return; if (Value *FAD = findFADAllocationInvisible(MemAccInst)) MemAccess->setFortranArrayDescriptor(FAD); else if (Value *FAD = findFADAllocationVisible(MemAccInst)) MemAccess->setFortranArrayDescriptor(FAD); } /// Check if @p Expr is divisible by @p Size. static bool isDivisible(const SCEV *Expr, unsigned Size, ScalarEvolution &SE) { assert(Size != 0); if (Size == 1) return true; // Only one factor needs to be divisible. if (auto *MulExpr = dyn_cast(Expr)) { for (auto *FactorExpr : MulExpr->operands()) if (isDivisible(FactorExpr, Size, SE)) return true; return false; } // For other n-ary expressions (Add, AddRec, Max,...) all operands need // to be divisible. if (auto *NAryExpr = dyn_cast(Expr)) { for (auto *OpExpr : NAryExpr->operands()) if (!isDivisible(OpExpr, Size, SE)) return false; return true; } auto *SizeSCEV = SE.getConstant(Expr->getType(), Size); auto *UDivSCEV = SE.getUDivExpr(Expr, SizeSCEV); auto *MulSCEV = SE.getMulExpr(UDivSCEV, SizeSCEV); return MulSCEV == Expr; } void ScopBuilder::foldSizeConstantsToRight() { isl::union_set Accessed = scop->getAccesses().range(); for (auto Array : scop->arrays()) { if (Array->getNumberOfDimensions() <= 1) continue; isl::space Space = Array->getSpace(); Space = Space.align_params(Accessed.get_space()); if (!Accessed.contains(Space)) continue; isl::set Elements = Accessed.extract_set(Space); isl::map Transform = isl::map::universe(Array->getSpace().map_from_set()); std::vector Int; int Dims = Elements.dim(isl::dim::set); for (int i = 0; i < Dims; i++) { isl::set DimOnly = isl::set(Elements).project_out(isl::dim::set, 0, i); DimOnly = DimOnly.project_out(isl::dim::set, 1, Dims - i - 1); DimOnly = DimOnly.lower_bound_si(isl::dim::set, 0, 0); isl::basic_set DimHull = DimOnly.affine_hull(); if (i == Dims - 1) { Int.push_back(1); Transform = Transform.equate(isl::dim::in, i, isl::dim::out, i); continue; } if (DimHull.dim(isl::dim::div) == 1) { isl::aff Diff = DimHull.get_div(0); isl::val Val = Diff.get_denominator_val(); int ValInt = 1; if (Val.is_int()) { auto ValAPInt = APIntFromVal(Val); if (ValAPInt.isSignedIntN(32)) ValInt = ValAPInt.getSExtValue(); } else { } Int.push_back(ValInt); isl::constraint C = isl::constraint::alloc_equality( isl::local_space(Transform.get_space())); C = C.set_coefficient_si(isl::dim::out, i, ValInt); C = C.set_coefficient_si(isl::dim::in, i, -1); Transform = Transform.add_constraint(C); continue; } isl::basic_set ZeroSet = isl::basic_set(DimHull); ZeroSet = ZeroSet.fix_si(isl::dim::set, 0, 0); int ValInt = 1; if (ZeroSet.is_equal(DimHull)) { ValInt = 0; } Int.push_back(ValInt); Transform = Transform.equate(isl::dim::in, i, isl::dim::out, i); } isl::set MappedElements = isl::map(Transform).domain(); if (!Elements.is_subset(MappedElements)) continue; bool CanFold = true; if (Int[0] <= 1) CanFold = false; unsigned NumDims = Array->getNumberOfDimensions(); for (unsigned i = 1; i < NumDims - 1; i++) if (Int[0] != Int[i] && Int[i]) CanFold = false; if (!CanFold) continue; for (auto &Access : scop->access_functions()) if (Access->getScopArrayInfo() == Array) Access->setAccessRelation( Access->getAccessRelation().apply_range(Transform)); std::vector Sizes; for (unsigned i = 0; i < NumDims; i++) { auto Size = Array->getDimensionSize(i); if (i == NumDims - 1) Size = SE.getMulExpr(Size, SE.getConstant(Size->getType(), Int[0])); Sizes.push_back(Size); } Array->updateSizes(Sizes, false /* CheckConsistency */); } } void ScopBuilder::markFortranArrays() { for (ScopStmt &Stmt : *scop) { for (MemoryAccess *MemAcc : Stmt) { Value *FAD = MemAcc->getFortranArrayDescriptor(); if (!FAD) continue; // TODO: const_cast-ing to edit ScopArrayInfo *SAI = const_cast(MemAcc->getLatestScopArrayInfo()); assert(SAI && "memory access into a Fortran array does not " "have an associated ScopArrayInfo"); SAI->applyAndSetFAD(FAD); } } } void ScopBuilder::finalizeAccesses() { updateAccessDimensionality(); foldSizeConstantsToRight(); foldAccessRelations(); assumeNoOutOfBounds(); markFortranArrays(); } void ScopBuilder::updateAccessDimensionality() { // Check all array accesses for each base pointer and find a (virtual) element // size for the base pointer that divides all access functions. for (ScopStmt &Stmt : *scop) for (MemoryAccess *Access : Stmt) { if (!Access->isArrayKind()) continue; ScopArrayInfo *Array = const_cast(Access->getScopArrayInfo()); if (Array->getNumberOfDimensions() != 1) continue; unsigned DivisibleSize = Array->getElemSizeInBytes(); const SCEV *Subscript = Access->getSubscript(0); while (!isDivisible(Subscript, DivisibleSize, SE)) DivisibleSize /= 2; auto *Ty = IntegerType::get(SE.getContext(), DivisibleSize * 8); Array->updateElementType(Ty); } for (auto &Stmt : *scop) for (auto &Access : Stmt) Access->updateDimensionality(); } void ScopBuilder::foldAccessRelations() { for (auto &Stmt : *scop) for (auto &Access : Stmt) Access->foldAccessRelation(); } void ScopBuilder::assumeNoOutOfBounds() { for (auto &Stmt : *scop) for (auto &Access : Stmt) Access->assumeNoOutOfBound(); } void ScopBuilder::ensureValueWrite(Instruction *Inst) { // Find the statement that defines the value of Inst. That statement has to // write the value to make it available to those statements that read it. ScopStmt *Stmt = scop->getStmtFor(Inst); // It is possible that the value is synthesizable within a loop (such that it // is not part of any statement), but not after the loop (where you need the // number of loop round-trips to synthesize it). In LCSSA-form a PHI node will // avoid this. In case the IR has no such PHI, use the last statement (where // the value is synthesizable) to write the value. if (!Stmt) Stmt = scop->getLastStmtFor(Inst->getParent()); // Inst not defined within this SCoP. if (!Stmt) return; // Do not process further if the instruction is already written. if (Stmt->lookupValueWriteOf(Inst)) return; addMemoryAccess(Stmt, Inst, MemoryAccess::MUST_WRITE, Inst, Inst->getType(), true, Inst, ArrayRef(), ArrayRef(), MemoryKind::Value); } void ScopBuilder::ensureValueRead(Value *V, ScopStmt *UserStmt) { // TODO: Make ScopStmt::ensureValueRead(Value*) offer the same functionality // to be able to replace this one. Currently, there is a split responsibility. // In a first step, the MemoryAccess is created, but without the // AccessRelation. In the second step by ScopStmt::buildAccessRelations(), the // AccessRelation is created. At least for scalar accesses, there is no new // information available at ScopStmt::buildAccessRelations(), so we could // create the AccessRelation right away. This is what // ScopStmt::ensureValueRead(Value*) does. auto *Scope = UserStmt->getSurroundingLoop(); auto VUse = VirtualUse::create(scop.get(), UserStmt, Scope, V, false); switch (VUse.getKind()) { case VirtualUse::Constant: case VirtualUse::Block: case VirtualUse::Synthesizable: case VirtualUse::Hoisted: case VirtualUse::Intra: // Uses of these kinds do not need a MemoryAccess. break; case VirtualUse::ReadOnly: // Add MemoryAccess for invariant values only if requested. if (!ModelReadOnlyScalars) break; LLVM_FALLTHROUGH; case VirtualUse::Inter: // Do not create another MemoryAccess for reloading the value if one already // exists. if (UserStmt->lookupValueReadOf(V)) break; addMemoryAccess(UserStmt, nullptr, MemoryAccess::READ, V, V->getType(), true, V, ArrayRef(), ArrayRef(), MemoryKind::Value); // Inter-statement uses need to write the value in their defining statement. if (VUse.isInter()) ensureValueWrite(cast(V)); break; } } void ScopBuilder::ensurePHIWrite(PHINode *PHI, ScopStmt *IncomingStmt, BasicBlock *IncomingBlock, Value *IncomingValue, bool IsExitBlock) { // As the incoming block might turn out to be an error statement ensure we // will create an exit PHI SAI object. It is needed during code generation // and would be created later anyway. if (IsExitBlock) scop->getOrCreateScopArrayInfo(PHI, PHI->getType(), {}, MemoryKind::ExitPHI); // This is possible if PHI is in the SCoP's entry block. The incoming blocks // from outside the SCoP's region have no statement representation. if (!IncomingStmt) return; // Take care for the incoming value being available in the incoming block. // This must be done before the check for multiple PHI writes because multiple // exiting edges from subregion each can be the effective written value of the // subregion. As such, all of them must be made available in the subregion // statement. ensureValueRead(IncomingValue, IncomingStmt); // Do not add more than one MemoryAccess per PHINode and ScopStmt. if (MemoryAccess *Acc = IncomingStmt->lookupPHIWriteOf(PHI)) { assert(Acc->getAccessInstruction() == PHI); Acc->addIncoming(IncomingBlock, IncomingValue); return; } MemoryAccess *Acc = addMemoryAccess( IncomingStmt, PHI, MemoryAccess::MUST_WRITE, PHI, PHI->getType(), true, PHI, ArrayRef(), ArrayRef(), IsExitBlock ? MemoryKind::ExitPHI : MemoryKind::PHI); assert(Acc); Acc->addIncoming(IncomingBlock, IncomingValue); } void ScopBuilder::addPHIReadAccess(ScopStmt *PHIStmt, PHINode *PHI) { addMemoryAccess(PHIStmt, PHI, MemoryAccess::READ, PHI, PHI->getType(), true, PHI, ArrayRef(), ArrayRef(), MemoryKind::PHI); } void ScopBuilder::buildDomain(ScopStmt &Stmt) { isl::id Id = isl::id::alloc(scop->getIslCtx(), Stmt.getBaseName(), &Stmt); Stmt.Domain = scop->getDomainConditions(&Stmt); Stmt.Domain = Stmt.Domain.set_tuple_id(Id); } void ScopBuilder::collectSurroundingLoops(ScopStmt &Stmt) { isl::set Domain = Stmt.getDomain(); BasicBlock *BB = Stmt.getEntryBlock(); Loop *L = LI.getLoopFor(BB); while (L && Stmt.isRegionStmt() && Stmt.getRegion()->contains(L)) L = L->getParentLoop(); SmallVector Loops; while (L && Stmt.getParent()->getRegion().contains(L)) { Loops.push_back(L); L = L->getParentLoop(); } Stmt.NestLoops.insert(Stmt.NestLoops.begin(), Loops.rbegin(), Loops.rend()); } /// Return the reduction type for a given binary operator. static MemoryAccess::ReductionType getReductionType(const BinaryOperator *BinOp, const Instruction *Load) { if (!BinOp) return MemoryAccess::RT_NONE; switch (BinOp->getOpcode()) { case Instruction::FAdd: if (!BinOp->isFast()) return MemoryAccess::RT_NONE; LLVM_FALLTHROUGH; case Instruction::Add: return MemoryAccess::RT_ADD; case Instruction::Or: return MemoryAccess::RT_BOR; case Instruction::Xor: return MemoryAccess::RT_BXOR; case Instruction::And: return MemoryAccess::RT_BAND; case Instruction::FMul: if (!BinOp->isFast()) return MemoryAccess::RT_NONE; LLVM_FALLTHROUGH; case Instruction::Mul: if (DisableMultiplicativeReductions) return MemoryAccess::RT_NONE; return MemoryAccess::RT_MUL; default: return MemoryAccess::RT_NONE; } } void ScopBuilder::checkForReductions(ScopStmt &Stmt) { SmallVector Loads; SmallVector, 4> Candidates; // First collect candidate load-store reduction chains by iterating over all // stores and collecting possible reduction loads. for (MemoryAccess *StoreMA : Stmt) { if (StoreMA->isRead()) continue; Loads.clear(); collectCandidateReductionLoads(StoreMA, Loads); for (MemoryAccess *LoadMA : Loads) Candidates.push_back(std::make_pair(LoadMA, StoreMA)); } // Then check each possible candidate pair. for (const auto &CandidatePair : Candidates) { bool Valid = true; isl::map LoadAccs = CandidatePair.first->getAccessRelation(); isl::map StoreAccs = CandidatePair.second->getAccessRelation(); // Skip those with obviously unequal base addresses. if (!LoadAccs.has_equal_space(StoreAccs)) { continue; } // And check if the remaining for overlap with other memory accesses. isl::map AllAccsRel = LoadAccs.unite(StoreAccs); AllAccsRel = AllAccsRel.intersect_domain(Stmt.getDomain()); isl::set AllAccs = AllAccsRel.range(); for (MemoryAccess *MA : Stmt) { if (MA == CandidatePair.first || MA == CandidatePair.second) continue; isl::map AccRel = MA->getAccessRelation().intersect_domain(Stmt.getDomain()); isl::set Accs = AccRel.range(); if (AllAccs.has_equal_space(Accs)) { isl::set OverlapAccs = Accs.intersect(AllAccs); Valid = Valid && OverlapAccs.is_empty(); } } if (!Valid) continue; const LoadInst *Load = dyn_cast(CandidatePair.first->getAccessInstruction()); MemoryAccess::ReductionType RT = getReductionType(dyn_cast(Load->user_back()), Load); // If no overlapping access was found we mark the load and store as // reduction like. CandidatePair.first->markAsReductionLike(RT); CandidatePair.second->markAsReductionLike(RT); } } void ScopBuilder::verifyInvariantLoads() { auto &RIL = scop->getRequiredInvariantLoads(); for (LoadInst *LI : RIL) { assert(LI && scop->contains(LI)); // If there exists a statement in the scop which has a memory access for // @p LI, then mark this scop as infeasible for optimization. for (ScopStmt &Stmt : *scop) if (Stmt.getArrayAccessOrNULLFor(LI)) { scop->invalidate(INVARIANTLOAD, LI->getDebugLoc(), LI->getParent()); return; } } } void ScopBuilder::hoistInvariantLoads() { if (!PollyInvariantLoadHoisting) return; isl::union_map Writes = scop->getWrites(); for (ScopStmt &Stmt : *scop) { InvariantAccessesTy InvariantAccesses; for (MemoryAccess *Access : Stmt) if (isl::set NHCtx = getNonHoistableCtx(Access, Writes)) InvariantAccesses.push_back({Access, NHCtx}); // Transfer the memory access from the statement to the SCoP. for (auto InvMA : InvariantAccesses) Stmt.removeMemoryAccess(InvMA.MA); addInvariantLoads(Stmt, InvariantAccesses); } } /// Check if an access range is too complex. /// /// An access range is too complex, if it contains either many disjuncts or /// very complex expressions. As a simple heuristic, we assume if a set to /// be too complex if the sum of existentially quantified dimensions and /// set dimensions is larger than a threshold. This reliably detects both /// sets with many disjuncts as well as sets with many divisions as they /// arise in h264. /// /// @param AccessRange The range to check for complexity. /// /// @returns True if the access range is too complex. static bool isAccessRangeTooComplex(isl::set AccessRange) { int NumTotalDims = 0; for (isl::basic_set BSet : AccessRange.get_basic_set_list()) { NumTotalDims += BSet.dim(isl::dim::div); NumTotalDims += BSet.dim(isl::dim::set); } if (NumTotalDims > MaxDimensionsInAccessRange) return true; return false; } bool ScopBuilder::hasNonHoistableBasePtrInScop(MemoryAccess *MA, isl::union_map Writes) { if (auto *BasePtrMA = scop->lookupBasePtrAccess(MA)) { return getNonHoistableCtx(BasePtrMA, Writes).is_null(); } Value *BaseAddr = MA->getOriginalBaseAddr(); if (auto *BasePtrInst = dyn_cast(BaseAddr)) if (!isa(BasePtrInst)) return scop->contains(BasePtrInst); return false; } void ScopBuilder::addUserContext() { if (UserContextStr.empty()) return; isl::set UserContext = isl::set(scop->getIslCtx(), UserContextStr.c_str()); isl::space Space = scop->getParamSpace(); if (Space.dim(isl::dim::param) != UserContext.dim(isl::dim::param)) { std::string SpaceStr = Space.to_str(); errs() << "Error: the context provided in -polly-context has not the same " << "number of dimensions than the computed context. Due to this " << "mismatch, the -polly-context option is ignored. Please provide " << "the context in the parameter space: " << SpaceStr << ".\n"; return; } for (unsigned i = 0; i < Space.dim(isl::dim::param); i++) { std::string NameContext = scop->getContext().get_dim_name(isl::dim::param, i); std::string NameUserContext = UserContext.get_dim_name(isl::dim::param, i); if (NameContext != NameUserContext) { std::string SpaceStr = Space.to_str(); errs() << "Error: the name of dimension " << i << " provided in -polly-context " << "is '" << NameUserContext << "', but the name in the computed " << "context is '" << NameContext << "'. Due to this name mismatch, " << "the -polly-context option is ignored. Please provide " << "the context in the parameter space: " << SpaceStr << ".\n"; return; } UserContext = UserContext.set_dim_id(isl::dim::param, i, Space.get_dim_id(isl::dim::param, i)); } isl::set newContext = scop->getContext().intersect(UserContext); scop->setContext(newContext); } isl::set ScopBuilder::getNonHoistableCtx(MemoryAccess *Access, isl::union_map Writes) { // TODO: Loads that are not loop carried, hence are in a statement with // zero iterators, are by construction invariant, though we // currently "hoist" them anyway. This is necessary because we allow // them to be treated as parameters (e.g., in conditions) and our code // generation would otherwise use the old value. auto &Stmt = *Access->getStatement(); BasicBlock *BB = Stmt.getEntryBlock(); if (Access->isScalarKind() || Access->isWrite() || !Access->isAffine() || Access->isMemoryIntrinsic()) return nullptr; // Skip accesses that have an invariant base pointer which is defined but // not loaded inside the SCoP. This can happened e.g., if a readnone call // returns a pointer that is used as a base address. However, as we want // to hoist indirect pointers, we allow the base pointer to be defined in // the region if it is also a memory access. Each ScopArrayInfo object // that has a base pointer origin has a base pointer that is loaded and // that it is invariant, thus it will be hoisted too. However, if there is // no base pointer origin we check that the base pointer is defined // outside the region. auto *LI = cast(Access->getAccessInstruction()); if (hasNonHoistableBasePtrInScop(Access, Writes)) return nullptr; isl::map AccessRelation = Access->getAccessRelation(); assert(!AccessRelation.is_empty()); if (AccessRelation.involves_dims(isl::dim::in, 0, Stmt.getNumIterators())) return nullptr; AccessRelation = AccessRelation.intersect_domain(Stmt.getDomain()); isl::set SafeToLoad; auto &DL = scop->getFunction().getParent()->getDataLayout(); if (isSafeToLoadUnconditionally(LI->getPointerOperand(), LI->getType(), LI->getAlignment(), DL)) { SafeToLoad = isl::set::universe(AccessRelation.get_space().range()); } else if (BB != LI->getParent()) { // Skip accesses in non-affine subregions as they might not be executed // under the same condition as the entry of the non-affine subregion. return nullptr; } else { SafeToLoad = AccessRelation.range(); } if (isAccessRangeTooComplex(AccessRelation.range())) return nullptr; isl::union_map Written = Writes.intersect_range(SafeToLoad); isl::set WrittenCtx = Written.params(); bool IsWritten = !WrittenCtx.is_empty(); if (!IsWritten) return WrittenCtx; WrittenCtx = WrittenCtx.remove_divs(); bool TooComplex = WrittenCtx.n_basic_set() >= MaxDisjunctsInDomain; if (TooComplex || !isRequiredInvariantLoad(LI)) return nullptr; scop->addAssumption(INVARIANTLOAD, WrittenCtx, LI->getDebugLoc(), AS_RESTRICTION, LI->getParent()); return WrittenCtx; } static bool isAParameter(llvm::Value *maybeParam, const Function &F) { for (const llvm::Argument &Arg : F.args()) if (&Arg == maybeParam) return true; return false; } bool ScopBuilder::canAlwaysBeHoisted(MemoryAccess *MA, bool StmtInvalidCtxIsEmpty, bool MAInvalidCtxIsEmpty, bool NonHoistableCtxIsEmpty) { LoadInst *LInst = cast(MA->getAccessInstruction()); const DataLayout &DL = LInst->getParent()->getModule()->getDataLayout(); if (PollyAllowDereferenceOfAllFunctionParams && isAParameter(LInst->getPointerOperand(), scop->getFunction())) return true; // TODO: We can provide more information for better but more expensive // results. if (!isDereferenceableAndAlignedPointer(LInst->getPointerOperand(), LInst->getType(), LInst->getAlignment(), DL)) return false; // If the location might be overwritten we do not hoist it unconditionally. // // TODO: This is probably too conservative. if (!NonHoistableCtxIsEmpty) return false; // If a dereferenceable load is in a statement that is modeled precisely we // can hoist it. if (StmtInvalidCtxIsEmpty && MAInvalidCtxIsEmpty) return true; // Even if the statement is not modeled precisely we can hoist the load if it // does not involve any parameters that might have been specialized by the // statement domain. for (unsigned u = 0, e = MA->getNumSubscripts(); u < e; u++) if (!isa(MA->getSubscript(u))) return false; return true; } void ScopBuilder::addInvariantLoads(ScopStmt &Stmt, InvariantAccessesTy &InvMAs) { if (InvMAs.empty()) return; isl::set StmtInvalidCtx = Stmt.getInvalidContext(); bool StmtInvalidCtxIsEmpty = StmtInvalidCtx.is_empty(); // Get the context under which the statement is executed but remove the error // context under which this statement is reached. isl::set DomainCtx = Stmt.getDomain().params(); DomainCtx = DomainCtx.subtract(StmtInvalidCtx); if (DomainCtx.n_basic_set() >= MaxDisjunctsInDomain) { auto *AccInst = InvMAs.front().MA->getAccessInstruction(); scop->invalidate(COMPLEXITY, AccInst->getDebugLoc(), AccInst->getParent()); return; } // Project out all parameters that relate to loads in the statement. Otherwise // we could have cyclic dependences on the constraints under which the // hoisted loads are executed and we could not determine an order in which to // pre-load them. This happens because not only lower bounds are part of the // domain but also upper bounds. for (auto &InvMA : InvMAs) { auto *MA = InvMA.MA; Instruction *AccInst = MA->getAccessInstruction(); if (SE.isSCEVable(AccInst->getType())) { SetVector Values; for (const SCEV *Parameter : scop->parameters()) { Values.clear(); findValues(Parameter, SE, Values); if (!Values.count(AccInst)) continue; if (isl::id ParamId = scop->getIdForParam(Parameter)) { int Dim = DomainCtx.find_dim_by_id(isl::dim::param, ParamId); if (Dim >= 0) DomainCtx = DomainCtx.eliminate(isl::dim::param, Dim, 1); } } } } for (auto &InvMA : InvMAs) { auto *MA = InvMA.MA; isl::set NHCtx = InvMA.NonHoistableCtx; // Check for another invariant access that accesses the same location as // MA and if found consolidate them. Otherwise create a new equivalence // class at the end of InvariantEquivClasses. LoadInst *LInst = cast(MA->getAccessInstruction()); Type *Ty = LInst->getType(); const SCEV *PointerSCEV = SE.getSCEV(LInst->getPointerOperand()); isl::set MAInvalidCtx = MA->getInvalidContext(); bool NonHoistableCtxIsEmpty = NHCtx.is_empty(); bool MAInvalidCtxIsEmpty = MAInvalidCtx.is_empty(); isl::set MACtx; // Check if we know that this pointer can be speculatively accessed. if (canAlwaysBeHoisted(MA, StmtInvalidCtxIsEmpty, MAInvalidCtxIsEmpty, NonHoistableCtxIsEmpty)) { MACtx = isl::set::universe(DomainCtx.get_space()); } else { MACtx = DomainCtx; MACtx = MACtx.subtract(MAInvalidCtx.unite(NHCtx)); MACtx = MACtx.gist_params(scop->getContext()); } bool Consolidated = false; for (auto &IAClass : scop->invariantEquivClasses()) { if (PointerSCEV != IAClass.IdentifyingPointer || Ty != IAClass.AccessType) continue; // If the pointer and the type is equal check if the access function wrt. // to the domain is equal too. It can happen that the domain fixes // parameter values and these can be different for distinct part of the // SCoP. If this happens we cannot consolidate the loads but need to // create a new invariant load equivalence class. auto &MAs = IAClass.InvariantAccesses; if (!MAs.empty()) { auto *LastMA = MAs.front(); isl::set AR = MA->getAccessRelation().range(); isl::set LastAR = LastMA->getAccessRelation().range(); bool SameAR = AR.is_equal(LastAR); if (!SameAR) continue; } // Add MA to the list of accesses that are in this class. MAs.push_front(MA); Consolidated = true; // Unify the execution context of the class and this statement. isl::set IAClassDomainCtx = IAClass.ExecutionContext; if (IAClassDomainCtx) IAClassDomainCtx = IAClassDomainCtx.unite(MACtx).coalesce(); else IAClassDomainCtx = MACtx; IAClass.ExecutionContext = IAClassDomainCtx; break; } if (Consolidated) continue; MACtx = MACtx.coalesce(); // If we did not consolidate MA, thus did not find an equivalence class // for it, we create a new one. scop->addInvariantEquivClass( InvariantEquivClassTy{PointerSCEV, MemoryAccessList{MA}, MACtx, Ty}); } } void ScopBuilder::collectCandidateReductionLoads( MemoryAccess *StoreMA, SmallVectorImpl &Loads) { ScopStmt *Stmt = StoreMA->getStatement(); auto *Store = dyn_cast(StoreMA->getAccessInstruction()); if (!Store) return; // Skip if there is not one binary operator between the load and the store auto *BinOp = dyn_cast(Store->getValueOperand()); if (!BinOp) return; // Skip if the binary operators has multiple uses if (BinOp->getNumUses() != 1) return; // Skip if the opcode of the binary operator is not commutative/associative if (!BinOp->isCommutative() || !BinOp->isAssociative()) return; // Skip if the binary operator is outside the current SCoP if (BinOp->getParent() != Store->getParent()) return; // Skip if it is a multiplicative reduction and we disabled them if (DisableMultiplicativeReductions && (BinOp->getOpcode() == Instruction::Mul || BinOp->getOpcode() == Instruction::FMul)) return; // Check the binary operator operands for a candidate load auto *PossibleLoad0 = dyn_cast(BinOp->getOperand(0)); auto *PossibleLoad1 = dyn_cast(BinOp->getOperand(1)); if (!PossibleLoad0 && !PossibleLoad1) return; // A load is only a candidate if it cannot escape (thus has only this use) if (PossibleLoad0 && PossibleLoad0->getNumUses() == 1) if (PossibleLoad0->getParent() == Store->getParent()) Loads.push_back(&Stmt->getArrayAccessFor(PossibleLoad0)); if (PossibleLoad1 && PossibleLoad1->getNumUses() == 1) if (PossibleLoad1->getParent() == Store->getParent()) Loads.push_back(&Stmt->getArrayAccessFor(PossibleLoad1)); } /// Find the canonical scop array info object for a set of invariant load /// hoisted loads. The canonical array is the one that corresponds to the /// first load in the list of accesses which is used as base pointer of a /// scop array. static const ScopArrayInfo *findCanonicalArray(Scop &S, MemoryAccessList &Accesses) { for (MemoryAccess *Access : Accesses) { const ScopArrayInfo *CanonicalArray = S.getScopArrayInfoOrNull( Access->getAccessInstruction(), MemoryKind::Array); if (CanonicalArray) return CanonicalArray; } return nullptr; } /// Check if @p Array severs as base array in an invariant load. static bool isUsedForIndirectHoistedLoad(Scop &S, const ScopArrayInfo *Array) { for (InvariantEquivClassTy &EqClass2 : S.getInvariantAccesses()) for (MemoryAccess *Access2 : EqClass2.InvariantAccesses) if (Access2->getScopArrayInfo() == Array) return true; return false; } /// Replace the base pointer arrays in all memory accesses referencing @p Old, /// with a reference to @p New. static void replaceBasePtrArrays(Scop &S, const ScopArrayInfo *Old, const ScopArrayInfo *New) { for (ScopStmt &Stmt : S) for (MemoryAccess *Access : Stmt) { if (Access->getLatestScopArrayInfo() != Old) continue; isl::id Id = New->getBasePtrId(); isl::map Map = Access->getAccessRelation(); Map = Map.set_tuple_id(isl::dim::out, Id); Access->setAccessRelation(Map); } } void ScopBuilder::canonicalizeDynamicBasePtrs() { for (InvariantEquivClassTy &EqClass : scop->InvariantEquivClasses) { MemoryAccessList &BasePtrAccesses = EqClass.InvariantAccesses; const ScopArrayInfo *CanonicalBasePtrSAI = findCanonicalArray(*scop, BasePtrAccesses); if (!CanonicalBasePtrSAI) continue; for (MemoryAccess *BasePtrAccess : BasePtrAccesses) { const ScopArrayInfo *BasePtrSAI = scop->getScopArrayInfoOrNull( BasePtrAccess->getAccessInstruction(), MemoryKind::Array); if (!BasePtrSAI || BasePtrSAI == CanonicalBasePtrSAI || !BasePtrSAI->isCompatibleWith(CanonicalBasePtrSAI)) continue; // we currently do not canonicalize arrays where some accesses are // hoisted as invariant loads. If we would, we need to update the access // function of the invariant loads as well. However, as this is not a // very common situation, we leave this for now to avoid further // complexity increases. if (isUsedForIndirectHoistedLoad(*scop, BasePtrSAI)) continue; replaceBasePtrArrays(*scop, BasePtrSAI, CanonicalBasePtrSAI); } } } void ScopBuilder::buildAccessRelations(ScopStmt &Stmt) { for (MemoryAccess *Access : Stmt.MemAccs) { Type *ElementType = Access->getElementType(); MemoryKind Ty; if (Access->isPHIKind()) Ty = MemoryKind::PHI; else if (Access->isExitPHIKind()) Ty = MemoryKind::ExitPHI; else if (Access->isValueKind()) Ty = MemoryKind::Value; else Ty = MemoryKind::Array; auto *SAI = scop->getOrCreateScopArrayInfo(Access->getOriginalBaseAddr(), ElementType, Access->Sizes, Ty); Access->buildAccessRelation(SAI); scop->addAccessData(Access); } } /// Add the minimal/maximal access in @p Set to @p User. /// /// @return True if more accesses should be added, false if we reached the /// maximal number of run-time checks to be generated. static bool buildMinMaxAccess(isl::set Set, Scop::MinMaxVectorTy &MinMaxAccesses, Scop &S) { isl::pw_multi_aff MinPMA, MaxPMA; isl::pw_aff LastDimAff; isl::aff OneAff; unsigned Pos; Set = Set.remove_divs(); polly::simplify(Set); if (Set.n_basic_set() > RunTimeChecksMaxAccessDisjuncts) Set = Set.simple_hull(); // Restrict the number of parameters involved in the access as the lexmin/ // lexmax computation will take too long if this number is high. // // Experiments with a simple test case using an i7 4800MQ: // // #Parameters involved | Time (in sec) // 6 | 0.01 // 7 | 0.04 // 8 | 0.12 // 9 | 0.40 // 10 | 1.54 // 11 | 6.78 // 12 | 30.38 // if (isl_set_n_param(Set.get()) > RunTimeChecksMaxParameters) { unsigned InvolvedParams = 0; for (unsigned u = 0, e = isl_set_n_param(Set.get()); u < e; u++) if (Set.involves_dims(isl::dim::param, u, 1)) InvolvedParams++; if (InvolvedParams > RunTimeChecksMaxParameters) return false; } MinPMA = Set.lexmin_pw_multi_aff(); MaxPMA = Set.lexmax_pw_multi_aff(); MinPMA = MinPMA.coalesce(); MaxPMA = MaxPMA.coalesce(); // Adjust the last dimension of the maximal access by one as we want to // enclose the accessed memory region by MinPMA and MaxPMA. The pointer // we test during code generation might now point after the end of the // allocated array but we will never dereference it anyway. assert((!MaxPMA || MaxPMA.dim(isl::dim::out)) && "Assumed at least one output dimension"); Pos = MaxPMA.dim(isl::dim::out) - 1; LastDimAff = MaxPMA.get_pw_aff(Pos); OneAff = isl::aff(isl::local_space(LastDimAff.get_domain_space())); OneAff = OneAff.add_constant_si(1); LastDimAff = LastDimAff.add(OneAff); MaxPMA = MaxPMA.set_pw_aff(Pos, LastDimAff); if (!MinPMA || !MaxPMA) return false; MinMaxAccesses.push_back(std::make_pair(MinPMA, MaxPMA)); return true; } /// Wrapper function to calculate minimal/maximal accesses to each array. bool ScopBuilder::calculateMinMaxAccess(AliasGroupTy AliasGroup, Scop::MinMaxVectorTy &MinMaxAccesses) { MinMaxAccesses.reserve(AliasGroup.size()); isl::union_set Domains = scop->getDomains(); isl::union_map Accesses = isl::union_map::empty(scop->getParamSpace()); for (MemoryAccess *MA : AliasGroup) Accesses = Accesses.add_map(MA->getAccessRelation()); Accesses = Accesses.intersect_domain(Domains); isl::union_set Locations = Accesses.range(); bool LimitReached = false; for (isl::set Set : Locations.get_set_list()) { LimitReached |= !buildMinMaxAccess(Set, MinMaxAccesses, *scop); if (LimitReached) break; } return !LimitReached; } static isl::set getAccessDomain(MemoryAccess *MA) { isl::set Domain = MA->getStatement()->getDomain(); Domain = Domain.project_out(isl::dim::set, 0, Domain.n_dim()); return Domain.reset_tuple_id(); } bool ScopBuilder::buildAliasChecks() { if (!PollyUseRuntimeAliasChecks) return true; if (buildAliasGroups()) { // Aliasing assumptions do not go through addAssumption but we still want to // collect statistics so we do it here explicitly. if (scop->getAliasGroups().size()) Scop::incrementNumberOfAliasingAssumptions(1); return true; } // If a problem occurs while building the alias groups we need to delete // this SCoP and pretend it wasn't valid in the first place. To this end // we make the assumed context infeasible. scop->invalidate(ALIASING, DebugLoc()); LLVM_DEBUG( dbgs() << "\n\nNOTE: Run time checks for " << scop->getNameStr() << " could not be created as the number of parameters involved " "is too high. The SCoP will be " "dismissed.\nUse:\n\t--polly-rtc-max-parameters=X\nto adjust " "the maximal number of parameters but be advised that the " "compile time might increase exponentially.\n\n"); return false; } std::tuple> ScopBuilder::buildAliasGroupsForAccesses() { AliasSetTracker AST(AA); DenseMap PtrToAcc; DenseSet HasWriteAccess; for (ScopStmt &Stmt : *scop) { isl::set StmtDomain = Stmt.getDomain(); bool StmtDomainEmpty = StmtDomain.is_empty(); // Statements with an empty domain will never be executed. if (StmtDomainEmpty) continue; for (MemoryAccess *MA : Stmt) { if (MA->isScalarKind()) continue; if (!MA->isRead()) HasWriteAccess.insert(MA->getScopArrayInfo()); MemAccInst Acc(MA->getAccessInstruction()); if (MA->isRead() && isa(Acc)) PtrToAcc[cast(Acc)->getRawSource()] = MA; else PtrToAcc[Acc.getPointerOperand()] = MA; AST.add(Acc); } } AliasGroupVectorTy AliasGroups; for (AliasSet &AS : AST) { if (AS.isMustAlias() || AS.isForwardingAliasSet()) continue; AliasGroupTy AG; for (auto &PR : AS) AG.push_back(PtrToAcc[PR.getValue()]); if (AG.size() < 2) continue; AliasGroups.push_back(std::move(AG)); } return std::make_tuple(AliasGroups, HasWriteAccess); } bool ScopBuilder::buildAliasGroups() { // To create sound alias checks we perform the following steps: // o) We partition each group into read only and non read only accesses. // o) For each group with more than one base pointer we then compute minimal // and maximal accesses to each array of a group in read only and non // read only partitions separately. AliasGroupVectorTy AliasGroups; DenseSet HasWriteAccess; std::tie(AliasGroups, HasWriteAccess) = buildAliasGroupsForAccesses(); splitAliasGroupsByDomain(AliasGroups); for (AliasGroupTy &AG : AliasGroups) { if (!scop->hasFeasibleRuntimeContext()) return false; { IslMaxOperationsGuard MaxOpGuard(scop->getIslCtx().get(), OptComputeOut); bool Valid = buildAliasGroup(AG, HasWriteAccess); if (!Valid) return false; } if (isl_ctx_last_error(scop->getIslCtx().get()) == isl_error_quota) { scop->invalidate(COMPLEXITY, DebugLoc()); return false; } } return true; } bool ScopBuilder::buildAliasGroup( AliasGroupTy &AliasGroup, DenseSet HasWriteAccess) { AliasGroupTy ReadOnlyAccesses; AliasGroupTy ReadWriteAccesses; SmallPtrSet ReadWriteArrays; SmallPtrSet ReadOnlyArrays; if (AliasGroup.size() < 2) return true; for (MemoryAccess *Access : AliasGroup) { ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "PossibleAlias", Access->getAccessInstruction()) << "Possibly aliasing pointer, use restrict keyword."); const ScopArrayInfo *Array = Access->getScopArrayInfo(); if (HasWriteAccess.count(Array)) { ReadWriteArrays.insert(Array); ReadWriteAccesses.push_back(Access); } else { ReadOnlyArrays.insert(Array); ReadOnlyAccesses.push_back(Access); } } // If there are no read-only pointers, and less than two read-write pointers, // no alias check is needed. if (ReadOnlyAccesses.empty() && ReadWriteArrays.size() <= 1) return true; // If there is no read-write pointer, no alias check is needed. if (ReadWriteArrays.empty()) return true; // For non-affine accesses, no alias check can be generated as we cannot // compute a sufficiently tight lower and upper bound: bail out. for (MemoryAccess *MA : AliasGroup) { if (!MA->isAffine()) { scop->invalidate(ALIASING, MA->getAccessInstruction()->getDebugLoc(), MA->getAccessInstruction()->getParent()); return false; } } // Ensure that for all memory accesses for which we generate alias checks, // their base pointers are available. for (MemoryAccess *MA : AliasGroup) { if (MemoryAccess *BasePtrMA = scop->lookupBasePtrAccess(MA)) scop->addRequiredInvariantLoad( cast(BasePtrMA->getAccessInstruction())); } // scop->getAliasGroups().emplace_back(); // Scop::MinMaxVectorPairTy &pair = scop->getAliasGroups().back(); Scop::MinMaxVectorTy MinMaxAccessesReadWrite; Scop::MinMaxVectorTy MinMaxAccessesReadOnly; bool Valid; Valid = calculateMinMaxAccess(ReadWriteAccesses, MinMaxAccessesReadWrite); if (!Valid) return false; // Bail out if the number of values we need to compare is too large. // This is important as the number of comparisons grows quadratically with // the number of values we need to compare. if (MinMaxAccessesReadWrite.size() + ReadOnlyArrays.size() > RunTimeChecksMaxArraysPerGroup) return false; Valid = calculateMinMaxAccess(ReadOnlyAccesses, MinMaxAccessesReadOnly); scop->addAliasGroup(MinMaxAccessesReadWrite, MinMaxAccessesReadOnly); if (!Valid) return false; return true; } void ScopBuilder::splitAliasGroupsByDomain(AliasGroupVectorTy &AliasGroups) { for (unsigned u = 0; u < AliasGroups.size(); u++) { AliasGroupTy NewAG; AliasGroupTy &AG = AliasGroups[u]; AliasGroupTy::iterator AGI = AG.begin(); isl::set AGDomain = getAccessDomain(*AGI); while (AGI != AG.end()) { MemoryAccess *MA = *AGI; isl::set MADomain = getAccessDomain(MA); if (AGDomain.is_disjoint(MADomain)) { NewAG.push_back(MA); AGI = AG.erase(AGI); } else { AGDomain = AGDomain.unite(MADomain); AGI++; } } if (NewAG.size() > 1) AliasGroups.push_back(std::move(NewAG)); } } #ifndef NDEBUG static void verifyUse(Scop *S, Use &Op, LoopInfo &LI) { auto PhysUse = VirtualUse::create(S, Op, &LI, false); auto VirtUse = VirtualUse::create(S, Op, &LI, true); assert(PhysUse.getKind() == VirtUse.getKind()); } /// Check the consistency of every statement's MemoryAccesses. /// /// The check is carried out by expecting the "physical" kind of use (derived /// from the BasicBlocks instructions resides in) to be same as the "virtual" /// kind of use (derived from a statement's MemoryAccess). /// /// The "physical" uses are taken by ensureValueRead to determine whether to /// create MemoryAccesses. When done, the kind of scalar access should be the /// same no matter which way it was derived. /// /// The MemoryAccesses might be changed by later SCoP-modifying passes and hence /// can intentionally influence on the kind of uses (not corresponding to the /// "physical" anymore, hence called "virtual"). The CodeGenerator therefore has /// to pick up the virtual uses. But here in the code generator, this has not /// happened yet, such that virtual and physical uses are equivalent. static void verifyUses(Scop *S, LoopInfo &LI, DominatorTree &DT) { for (auto *BB : S->getRegion().blocks()) { for (auto &Inst : *BB) { auto *Stmt = S->getStmtFor(&Inst); if (!Stmt) continue; if (isIgnoredIntrinsic(&Inst)) continue; // Branch conditions are encoded in the statement domains. if (Inst.isTerminator() && Stmt->isBlockStmt()) continue; // Verify all uses. for (auto &Op : Inst.operands()) verifyUse(S, Op, LI); // Stores do not produce values used by other statements. if (isa(Inst)) continue; // For every value defined in the block, also check that a use of that // value in the same statement would not be an inter-statement use. It can // still be synthesizable or load-hoisted, but these kind of instructions // are not directly copied in code-generation. auto VirtDef = VirtualUse::create(S, Stmt, Stmt->getSurroundingLoop(), &Inst, true); assert(VirtDef.getKind() == VirtualUse::Synthesizable || VirtDef.getKind() == VirtualUse::Intra || VirtDef.getKind() == VirtualUse::Hoisted); } } if (S->hasSingleExitEdge()) return; // PHINodes in the SCoP region's exit block are also uses to be checked. if (!S->getRegion().isTopLevelRegion()) { for (auto &Inst : *S->getRegion().getExit()) { if (!isa(Inst)) break; for (auto &Op : Inst.operands()) verifyUse(S, Op, LI); } } } #endif /// Return the block that is the representing block for @p RN. static inline BasicBlock *getRegionNodeBasicBlock(RegionNode *RN) { return RN->isSubRegion() ? RN->getNodeAs()->getEntry() : RN->getNodeAs(); } void ScopBuilder::buildScop(Region &R, AssumptionCache &AC) { scop.reset(new Scop(R, SE, LI, DT, *SD.getDetectionContext(&R), ORE)); buildStmts(R); // Create all invariant load instructions first. These are categorized as // 'synthesizable', therefore are not part of any ScopStmt but need to be // created somewhere. const InvariantLoadsSetTy &RIL = scop->getRequiredInvariantLoads(); for (BasicBlock *BB : scop->getRegion().blocks()) { if (isErrorBlock(*BB, scop->getRegion(), LI, DT)) continue; for (Instruction &Inst : *BB) { LoadInst *Load = dyn_cast(&Inst); if (!Load) continue; if (!RIL.count(Load)) continue; // Invariant loads require a MemoryAccess to be created in some statement. // It is not important to which statement the MemoryAccess is added // because it will later be removed from the ScopStmt again. We chose the // first statement of the basic block the LoadInst is in. ArrayRef List = scop->getStmtListFor(BB); assert(!List.empty()); ScopStmt *RILStmt = List.front(); buildMemoryAccess(Load, RILStmt); } } buildAccessFunctions(); // In case the region does not have an exiting block we will later (during // code generation) split the exit block. This will move potential PHI nodes // from the current exit block into the new region exiting block. Hence, PHI // nodes that are at this point not part of the region will be. // To handle these PHI nodes later we will now model their operands as scalar // accesses. Note that we do not model anything in the exit block if we have // an exiting block in the region, as there will not be any splitting later. if (!R.isTopLevelRegion() && !scop->hasSingleExitEdge()) { for (Instruction &Inst : *R.getExit()) { PHINode *PHI = dyn_cast(&Inst); if (!PHI) break; buildPHIAccesses(nullptr, PHI, nullptr, true); } } // Create memory accesses for global reads since all arrays are now known. auto *AF = SE.getConstant(IntegerType::getInt64Ty(SE.getContext()), 0); for (auto GlobalReadPair : GlobalReads) { ScopStmt *GlobalReadStmt = GlobalReadPair.first; Instruction *GlobalRead = GlobalReadPair.second; for (auto *BP : ArrayBasePointers) addArrayAccess(GlobalReadStmt, MemAccInst(GlobalRead), MemoryAccess::READ, BP, BP->getType(), false, {AF}, {nullptr}, GlobalRead); } buildInvariantEquivalenceClasses(); /// A map from basic blocks to their invalid domains. DenseMap InvalidDomainMap; if (!scop->buildDomains(&R, DT, LI, InvalidDomainMap)) { LLVM_DEBUG( dbgs() << "Bailing-out because buildDomains encountered problems\n"); return; } scop->addUserAssumptions(AC, DT, LI, InvalidDomainMap); // Initialize the invalid domain. for (ScopStmt &Stmt : scop->Stmts) if (Stmt.isBlockStmt()) Stmt.setInvalidDomain(InvalidDomainMap[Stmt.getEntryBlock()]); else Stmt.setInvalidDomain(InvalidDomainMap[getRegionNodeBasicBlock( Stmt.getRegion()->getNode())]); // Remove empty statements. // Exit early in case there are no executable statements left in this scop. scop->removeStmtNotInDomainMap(); scop->simplifySCoP(false); if (scop->isEmpty()) { LLVM_DEBUG(dbgs() << "Bailing-out because SCoP is empty\n"); return; } // The ScopStmts now have enough information to initialize themselves. for (ScopStmt &Stmt : *scop) { collectSurroundingLoops(Stmt); buildDomain(Stmt); buildAccessRelations(Stmt); if (DetectReductions) checkForReductions(Stmt); } // Check early for a feasible runtime context. if (!scop->hasFeasibleRuntimeContext()) { LLVM_DEBUG(dbgs() << "Bailing-out because of unfeasible context (early)\n"); return; } // Check early for profitability. Afterwards it cannot change anymore, // only the runtime context could become infeasible. if (!scop->isProfitable(UnprofitableScalarAccs)) { scop->invalidate(PROFITABLE, DebugLoc()); LLVM_DEBUG( dbgs() << "Bailing-out because SCoP is not considered profitable\n"); return; } buildSchedule(); finalizeAccesses(); scop->realignParams(); addUserContext(); // After the context was fully constructed, thus all our knowledge about // the parameters is in there, we add all recorded assumptions to the // assumed/invalid context. addRecordedAssumptions(); scop->simplifyContexts(); if (!buildAliasChecks()) { LLVM_DEBUG(dbgs() << "Bailing-out because could not build alias checks\n"); return; } hoistInvariantLoads(); canonicalizeDynamicBasePtrs(); verifyInvariantLoads(); scop->simplifySCoP(true); // Check late for a feasible runtime context because profitability did not // change. if (!scop->hasFeasibleRuntimeContext()) { LLVM_DEBUG(dbgs() << "Bailing-out because of unfeasible context (late)\n"); return; } #ifndef NDEBUG verifyUses(scop.get(), LI, DT); #endif } ScopBuilder::ScopBuilder(Region *R, AssumptionCache &AC, AliasAnalysis &AA, const DataLayout &DL, DominatorTree &DT, LoopInfo &LI, ScopDetection &SD, ScalarEvolution &SE, OptimizationRemarkEmitter &ORE) : AA(AA), DL(DL), DT(DT), LI(LI), SD(SD), SE(SE), ORE(ORE) { DebugLoc Beg, End; auto P = getBBPairForRegion(R); getDebugLocations(P, Beg, End); std::string Msg = "SCoP begins here."; ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "ScopEntry", Beg, P.first) << Msg); buildScop(*R, AC); LLVM_DEBUG(dbgs() << *scop); if (!scop->hasFeasibleRuntimeContext()) { InfeasibleScops++; Msg = "SCoP ends here but was dismissed."; LLVM_DEBUG(dbgs() << "SCoP detected but dismissed\n"); scop.reset(); } else { Msg = "SCoP ends here."; ++ScopFound; if (scop->getMaxLoopDepth() > 0) ++RichScopFound; } if (R->isTopLevelRegion()) ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "ScopEnd", End, P.first) << Msg); else ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "ScopEnd", End, P.second) << Msg); }