diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 7ec9bdb318ff..f222a9905c3b 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -2148,10 +2148,6 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, "different type sizes\n"); return Dependence::Unknown; } - - if (!CommonStride) - return Dependence::Unknown; - // Bail out early if passed-in parameters make vectorization not feasible. unsigned ForcedFactor = (VectorizerParams::VectorizationFactor ? VectorizerParams::VectorizationFactor : 1); @@ -2162,7 +2158,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // It's not vectorizable if the distance is smaller than the minimum distance // needed for a vectroized/unrolled version. Vectorizing one iteration in - // front needs CommonStride. Vectorizing the last iteration needs TypeByteSize + // front needs MaxStride. Vectorizing the last iteration needs TypeByteSize. // (No need to plus the last gap distance). // // E.g. Assume one char is 1 byte in memory and one int is 4 bytes. @@ -2186,11 +2182,14 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, // If MinNumIter is 4 (Say if a user forces the vectorization factor to be 4), // the minimum distance needed is 28, which is greater than distance. It is // not safe to do vectorization. + // + // We use MaxStride (maximum of src and sink strides) to get a conservative + // lower bound on the MinDistanceNeeded in case of different strides. // We know that Dist is positive, but it may not be constant. Use the signed // minimum for computations below, as this ensures we compute the closest // possible dependence distance. 
- uint64_t MinDistanceNeeded = *CommonStride * (MinNumIter - 1) + TypeByteSize; + uint64_t MinDistanceNeeded = MaxStride * (MinNumIter - 1) + TypeByteSize; if (MinDistanceNeeded > static_cast<uint64_t>(MinDistance)) { if (!ConstDist) { // For non-constant distances, we checked the lower bound of the @@ -2236,7 +2235,7 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx, couldPreventStoreLoadForward(MinDistance, TypeByteSize, *CommonStride)) return Dependence::BackwardVectorizableButPreventsForwarding; - uint64_t MaxVF = MinDepDistBytes / *CommonStride; + uint64_t MaxVF = MinDepDistBytes / MaxStride; LLVM_DEBUG(dbgs() << "LAA: Positive min distance " << MinDistance << " with max VF = " << MaxVF << '\n'); diff --git a/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll new file mode 100644 index 000000000000..c5f31de2f823 --- /dev/null +++ b/llvm/test/Analysis/LoopAccessAnalysis/different_strides.ll @@ -0,0 +1,156 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes="print<access-info>" %s 2>&1 | FileCheck %s + +@a = dso_local local_unnamed_addr global [65536 x float] zeroinitializer, align 16 + +; Generated from the following C code: +; #define LEN 256 * 256 +; float a[LEN]; +; +; void different_strides() { +; for (int i = 0; i < LEN - 1024 - 255; i++) { +; #pragma clang loop interleave(disable) +; #pragma clang loop unroll(disable) +; for (int j = 0; j < 256; j++) +; a[i + j + 1024] += a[j * 4 + i]; +; } +; } +; The load and store have different strides(4 and 16 bytes respectively) but the store +; is always at safe positive distance away from the load, thus BackwardVectorizable +define void @different_strides_backward_vectorizable() { +; CHECK-LABEL: 'different_strides_backward_vectorizable' +; CHECK-NEXT: inner.body: +; CHECK-NEXT: Memory dependences are safe with a maximum safe vector width of 2048 bits +; CHECK-NEXT: 
Dependences: +; CHECK-NEXT: BackwardVectorizable: +; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 -> +; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Forward: +; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 -> +; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: outer.header: +; CHECK-NEXT: Report: loop is not the innermost loop +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %outer.header + +outer.header: + %i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ] + %0 = add nuw nsw i64 %i, 1024 + br label %inner.body + +inner.body: + %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ] + %1 = shl nuw nsw i64 %j, 2 + %2 = add nuw nsw i64 %1, %i + %arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2 + %3 = load float, ptr %arrayidx, align 4 + %4 = add nuw nsw i64 %0, %j + %arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4 + %5 = load float, ptr %arrayidx8, align 4 + %add9 = fadd fast float %5, %3 + store float %add9, ptr %arrayidx8, align 4 + %j.next = add nuw nsw i64 %j, 1 + %exitcond.not = icmp eq i64 %j.next, 256 + br i1 %exitcond.not, label %outer.latch, label %inner.body + +outer.latch: + %i.next = add nuw nsw i64 %i, 1 + %outerexitcond.not = icmp eq i64 %i.next, 64257 + br i1 %outerexitcond.not, label %exit, label %outer.header + +exit: + ret void +} + + +; Generated from following C code: +; void 
different_stride_and_not_vectorizable(){ +; for(int i = 0; i < LEN2; i++){ +; for(int j = 0 ; j < LEN; j++){ +; a[i + j + LEN] += a[i + 4*j]; +; } +; } +; } +; The load and store have different strides, but the store and load are not at a +; safe distance away from each other, thus not safe for vectorization. +define void @different_stride_and_not_vectorizable() { +; CHECK-LABEL: 'different_stride_and_not_vectorizable' +; CHECK-NEXT: inner.body: +; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop +; CHECK-NEXT: Unknown data dependence. +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Unknown: +; CHECK-NEXT: %3 = load float, ptr %arrayidx, align 4 -> +; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Forward: +; CHECK-NEXT: %5 = load float, ptr %arrayidx8, align 4 -> +; CHECK-NEXT: store float %add9, ptr %arrayidx8, align 4 +; CHECK-EMPTY: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; CHECK-NEXT: outer.header: +; CHECK-NEXT: Report: loop is not the innermost loop +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + br label %outer.header + +outer.header: + %i = phi i64 [ 0, %entry ], [ %i.next, %outer.latch ] + %0 = add nuw nsw i64 %i, 256 + br label %inner.body + +inner.body: + %j = phi i64 [ 0, %outer.header ], [ %j.next, %inner.body ] + %1 = shl nuw nsw i64 %j, 2 + %2 = add nuw nsw i64 %1, %i + %arrayidx = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %2 + %3 = load float, ptr %arrayidx, align 4 + %4 = add nuw nsw i64 %0, %j + %arrayidx8 = getelementptr inbounds [65536 x float], ptr @a, i64 0, i64 %4 + %5 = load float, ptr %arrayidx8, align 4 + %add9 = fadd fast float %5, %3 + store float %add9, ptr %arrayidx8, align 4 + %j.next = add nuw nsw i64 %j, 1 + %exitcond.not = icmp eq i64 %j.next, 256 + br i1 %exitcond.not, label %outer.latch, label %inner.body + +outer.latch: + %i.next = add nuw nsw i64 %i, 1 + %exitcond29.not = icmp eq i64 %i.next, 65536 + br i1 %exitcond29.not, label %exit, label %outer.header + +exit: + ret void +}