This replaces patches that tried to convert related patterns to shuffles (D138872, D138873, D138874 - reverted/abandoned); those caused codegen problems and were questionable as a canonicalization because an insertelement is a simpler op than a shuffle.

This detects a larger pattern -- insert-of-insert -- and replaces it with another insert, so it hopefully does not cause any problems.

As noted by TODO items in the code and tests, this could go a lot further, but this is enough to reduce the motivating test from issue #17113.

Example proofs:
https://alive2.llvm.org/ce/z/NnUv3a

I drafted a version of this for AggressiveInstCombine, but it seems that would uncover yet another phase-ordering gap. If we do generalize this to handle the full range of potential patterns, that may be worth looking at again.

Differential Revision: https://reviews.llvm.org/D139668
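For illustration, one sound instance of the insert-of-insert shape looks like this (a minimal sketch with made-up values; the exact match conditions live in the patch, and the Alive2 link above has the real proofs):

  ; The outer insert clobbers the lane written by the inner insert,
  ; so the chain collapses to a single insert.
  %i0 = insertelement <4 x float> %v, float %a, i64 2
  %i1 = insertelement <4 x float> %i0, float %b, i64 2
  ; ==>
  %i1 = insertelement <4 x float> %v, float %b, i64 2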
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes="default<O3>" -S < %s | FileCheck %s --check-prefix=SSE
; RUN: opt -passes="default<O3>" -S -mattr=avx < %s | FileCheck %s --check-prefix=AVX

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64--"

%union.ElementWiseAccess = type { <4 x float> }

$getAt = comdat any
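; The vector is built lane-by-lane through a by-reference helper; at -O3 both
; run lines reduce the whole chain to a single <4 x float> load plus one
; shufflevector that duplicates lane 2 into lane 3 (note that the last two
; getAt calls both read index 2).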
define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #0 {
; SSE-LABEL: @ConvertVectors_ByRef(
; SSE-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
; SSE-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; SSE-NEXT:    ret <4 x float> [[TMP3]]
;
; AVX-LABEL: @ConvertVectors_ByRef(
; AVX-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[TMP0:%.*]], align 16
; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
; AVX-NEXT:    ret <4 x float> [[TMP3]]
;
  %2 = alloca ptr, align 8
  %3 = alloca <4 x float>, align 16
  store ptr %0, ptr %2, align 8
  %4 = load ptr, ptr %2, align 8
  %5 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %4)
  %6 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %5, i32 noundef 0)
  %7 = insertelement <4 x float> undef, float %6, i32 0
  %8 = load ptr, ptr %2, align 8
  %9 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %8)
  %10 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %9, i32 noundef 1)
  %11 = insertelement <4 x float> %7, float %10, i32 1
  %12 = load ptr, ptr %2, align 8
  %13 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %12)
  %14 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %13, i32 noundef 2)
  %15 = insertelement <4 x float> %11, float %14, i32 2
  %16 = load ptr, ptr %2, align 8
  %17 = call noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %16)
  %18 = call noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %17, i32 noundef 2)
  %19 = insertelement <4 x float> %15, float %18, i32 3
  store <4 x float> %19, ptr %3, align 16
  %20 = load <4 x float>, ptr %3, align 16
  ret <4 x float> %20
}
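; Same lane-by-lane construction, but each element goes through a by-value
; union copy. Lanes 0 and 1 fold into the <4 x float> load; the scalar for
; lanes 2 and 3 is rebuilt from an i64 load at offset 8 (trunc + bitcast),
; so two trailing insertelements remain instead of a shuffle.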
define noundef <4 x float> @ConvertVectors_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %V) #0 {
; SSE-LABEL: @ConvertVectors_ByVal(
; SSE-NEXT:  entry:
; SSE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
; SSE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[V]], i64 8
; SSE-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
; SSE-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
; SSE-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
; SSE-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
; SSE-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
; SSE-NEXT:    ret <4 x float> [[VECINIT16]]
;
; AVX-LABEL: @ConvertVectors_ByVal(
; AVX-NEXT:  entry:
; AVX-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
; AVX-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[V]], i64 8
; AVX-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
; AVX-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
; AVX-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
; AVX-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
; AVX-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
; AVX-NEXT:    ret <4 x float> [[VECINIT16]]
;
entry:
  %V.addr = alloca ptr, align 8
  %.compoundliteral = alloca <4 x float>, align 16
  %ref.tmp = alloca %union.ElementWiseAccess, align 16
  %ref.tmp2 = alloca %union.ElementWiseAccess, align 16
  %ref.tmp7 = alloca %union.ElementWiseAccess, align 16
  %ref.tmp12 = alloca %union.ElementWiseAccess, align 16
  store ptr %V, ptr %V.addr, align 8
  call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp) #4
  %0 = load ptr, ptr %V.addr, align 8
  %call = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %0)
  %coerce.dive = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp, i32 0, i32 0
  %1 = getelementptr inbounds { double, double }, ptr %coerce.dive, i32 0, i32 0
  %2 = extractvalue { double, double } %call, 0
  store double %2, ptr %1, align 16
  %3 = getelementptr inbounds { double, double }, ptr %coerce.dive, i32 0, i32 1
  %4 = extractvalue { double, double } %call, 1
  store double %4, ptr %3, align 8
  %call1 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp, i32 noundef 0)
  %vecinit = insertelement <4 x float> undef, float %call1, i32 0
  call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp2) #4
  %5 = load ptr, ptr %V.addr, align 8
  %call3 = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %5)
  %coerce.dive4 = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp2, i32 0, i32 0
  %6 = getelementptr inbounds { double, double }, ptr %coerce.dive4, i32 0, i32 0
  %7 = extractvalue { double, double } %call3, 0
  store double %7, ptr %6, align 16
  %8 = getelementptr inbounds { double, double }, ptr %coerce.dive4, i32 0, i32 1
  %9 = extractvalue { double, double } %call3, 1
  store double %9, ptr %8, align 8
  %call5 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp2, i32 noundef 1)
  %vecinit6 = insertelement <4 x float> %vecinit, float %call5, i32 1
  call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp7) #4
  %10 = load ptr, ptr %V.addr, align 8
  %call8 = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %10)
  %coerce.dive9 = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp7, i32 0, i32 0
  %11 = getelementptr inbounds { double, double }, ptr %coerce.dive9, i32 0, i32 0
  %12 = extractvalue { double, double } %call8, 0
  store double %12, ptr %11, align 16
  %13 = getelementptr inbounds { double, double }, ptr %coerce.dive9, i32 0, i32 1
  %14 = extractvalue { double, double } %call8, 1
  store double %14, ptr %13, align 8
  %call10 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp7, i32 noundef 2)
  %vecinit11 = insertelement <4 x float> %vecinit6, float %call10, i32 2
  call void @llvm.lifetime.start.p0(i64 16, ptr %ref.tmp12) #4
  %15 = load ptr, ptr %V.addr, align 8
  %call13 = call { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %15)
  %coerce.dive14 = getelementptr inbounds %union.ElementWiseAccess, ptr %ref.tmp12, i32 0, i32 0
  %16 = getelementptr inbounds { double, double }, ptr %coerce.dive14, i32 0, i32 0
  %17 = extractvalue { double, double } %call13, 0
  store double %17, ptr %16, align 16
  %18 = getelementptr inbounds { double, double }, ptr %coerce.dive14, i32 0, i32 1
  %19 = extractvalue { double, double } %call13, 1
  store double %19, ptr %18, align 8
  %call15 = call noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %ref.tmp12, i32 noundef 2)
  %vecinit16 = insertelement <4 x float> %vecinit11, float %call15, i32 3
  store <4 x float> %vecinit16, ptr %.compoundliteral, align 16
  %20 = load <4 x float>, ptr %.compoundliteral, align 16
  call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp12) #4
  call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp7) #4
  call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp2) #4
  call void @llvm.lifetime.end.p0(i64 16, ptr %ref.tmp) #4
  ret <4 x float> %20
}
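; Helper definitions. At -O3 all of these are inlined into the callers above,
; which is why no calls remain in the CHECK lines.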
define internal { double, double } @castToElementWiseAccess_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %t) #1 {
entry:
  %retval = alloca %union.ElementWiseAccess, align 16
  %t.addr = alloca ptr, align 8
  store ptr %t, ptr %t.addr, align 8
  %0 = load ptr, ptr %t.addr, align 8
  call void @llvm.memcpy.p0.p0.i64(ptr align 16 %retval, ptr align 16 %0, i64 16, i1 false)
  %coerce.dive = getelementptr inbounds %union.ElementWiseAccess, ptr %retval, i32 0, i32 0
  %1 = load { double, double }, ptr %coerce.dive, align 16
  ret { double, double } %1
}

declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #2
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #2
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #3
define internal noundef nonnull align 16 dereferenceable(16) ptr @castToElementWiseAccess_ByRef(ptr noundef nonnull align 16 dereferenceable(16) %0) #1 {
  %2 = alloca ptr, align 8
  store ptr %0, ptr %2, align 8
  %3 = load ptr, ptr %2, align 8
  ret ptr %3
}
define linkonce_odr dso_local noundef float @getAt(ptr noundef nonnull align 16 dereferenceable(16) %0, i32 noundef %1) #1 comdat align 2 {
  %3 = alloca ptr, align 8
  %4 = alloca i32, align 4
  store ptr %0, ptr %3, align 8
  store i32 %1, ptr %4, align 4
  %5 = load ptr, ptr %3, align 8
  %6 = load i32, ptr %4, align 4
  %7 = sext i32 %6 to i64
  %8 = getelementptr inbounds [4 x float], ptr %5, i64 0, i64 %7
  %9 = load float, ptr %8, align 4
  ret float %9
}
define linkonce_odr noundef float @ElementWiseAccess5getAt(ptr noundef nonnull align 16 dereferenceable(16) %this, i32 noundef %i) #1 align 2 {
entry:
  %this.addr = alloca ptr, align 8
  %i.addr = alloca i32, align 4
  store ptr %this, ptr %this.addr, align 8
  store i32 %i, ptr %i.addr, align 4
  %this1 = load ptr, ptr %this.addr, align 8
  %0 = load i32, ptr %i.addr, align 4
  %idxprom = sext i32 %0 to i64
  %arrayidx = getelementptr inbounds [4 x float], ptr %this1, i64 0, i64 %idxprom
  %1 = load float, ptr %arrayidx, align 4
  ret float %1
}