The initial placement of vector-combine in the opt pipeline revealed phase ordering bugs: https://bugs.llvm.org/show_bug.cgi?id=45015 https://bugs.llvm.org/show_bug.cgi?id=42022 This patch contains a few independent changes: 1. Move the pass up in the pipeline, so it happens just after loop-vectorization. This is only to keep vectorization passes together in the pipeline at the moment. I don't have evidence of interaction between these yet. 2. Add an -early-cse pass after -vector-combine to clean up redundant ops. This was partly proposed as far back as rL219644 (which is why it's effectively being moved in the old PM code). This is important because the subsequent -instcombine doesn't work as well without EarlyCSE. With the CSE, -instcombine is able to squash shuffles together in 1 of the tests (because those are simple "select" shuffles). 3. Remove the -vector-combine pass that was running after SLP. We may want to do that eventually, but I don't have a test case to support it yet. Differential Revision: https://reviews.llvm.org/D75145
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mattr=avx | FileCheck %s
; RUN: opt -passes='default<O3>' -S < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

; Ideally, this should reach the backend with 1 fsub, 1 fadd, and 1 shuffle.
; That may require some coordination between VectorCombine, SLP, and other passes.
; The end goal is to get a single "vaddsubps" instruction for x86 with AVX.

; Alternating fsub/fadd on scalarized lanes; the pipeline should collapse this
; into two vector ops plus one blend shuffle (the x86 addsub pattern).
define <4 x float> @PR45015(<4 x float> %arg, <4 x float> %arg1) {
; CHECK-LABEL: @PR45015(
; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[ARG:%.*]], [[ARG1:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[ARG]], [[ARG1]]
; CHECK-NEXT: [[T16:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
; CHECK-NEXT: ret <4 x float> [[T16]]
;
  %t = extractelement <4 x float> %arg, i32 0
  %t2 = extractelement <4 x float> %arg1, i32 0
  %t3 = fsub float %t, %t2
  %t4 = insertelement <4 x float> undef, float %t3, i32 0
  %t5 = extractelement <4 x float> %arg, i32 1
  %t6 = extractelement <4 x float> %arg1, i32 1
  %t7 = fadd float %t5, %t6
  %t8 = insertelement <4 x float> %t4, float %t7, i32 1
  %t9 = extractelement <4 x float> %arg, i32 2
  %t10 = extractelement <4 x float> %arg1, i32 2
  %t11 = fsub float %t9, %t10
  %t12 = insertelement <4 x float> %t8, float %t11, i32 2
  %t13 = extractelement <4 x float> %arg, i32 3
  %t14 = extractelement <4 x float> %arg1, i32 3
  %t15 = fadd float %t13, %t14
  %t16 = insertelement <4 x float> %t12, float %t15, i32 3
  ret <4 x float> %t16
}

; PR42022 - https://bugs.llvm.org/show_bug.cgi?id=42022

%struct.Vector4 = type { float, float, float, float }

; Scalarized lane-wise adds on two <2 x float> pairs returned as an aggregate;
; the pipeline should turn each pair into a single vector fadd.
define { <2 x float>, <2 x float> } @add_aggregate(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1) {
; CHECK-LABEL: @add_aggregate(
; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
; CHECK-NEXT: [[FCA_0_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> [[TMP1]], 0
; CHECK-NEXT: [[FCA_1_INSERT:%.*]] = insertvalue { <2 x float>, <2 x float> } [[FCA_0_INSERT]], <2 x float> [[TMP2]], 1
; CHECK-NEXT: ret { <2 x float>, <2 x float> } [[FCA_1_INSERT]]
;
  %a00 = extractelement <2 x float> %a0, i32 0
  %b00 = extractelement <2 x float> %b0, i32 0
  %add = fadd float %a00, %b00
  %retval.0.0.insert = insertelement <2 x float> undef, float %add, i32 0
  %a01 = extractelement <2 x float> %a0, i32 1
  %b01 = extractelement <2 x float> %b0, i32 1
  %add4 = fadd float %a01, %b01
  %retval.0.1.insert = insertelement <2 x float> %retval.0.0.insert, float %add4, i32 1
  %a10 = extractelement <2 x float> %a1, i32 0
  %b10 = extractelement <2 x float> %b1, i32 0
  %add7 = fadd float %a10, %b10
  %retval.1.0.insert = insertelement <2 x float> undef, float %add7, i32 0
  %a11 = extractelement <2 x float> %a1, i32 1
  %b11 = extractelement <2 x float> %b1, i32 1
  %add10 = fadd float %a11, %b11
  %retval.1.1.insert = insertelement <2 x float> %retval.1.0.insert, float %add10, i32 1
  %fca.0.insert = insertvalue { <2 x float>, <2 x float> } undef, <2 x float> %retval.0.1.insert, 0
  %fca.1.insert = insertvalue { <2 x float>, <2 x float> } %fca.0.insert, <2 x float> %retval.1.1.insert, 1
  ret { <2 x float>, <2 x float> } %fca.1.insert
}

; Same lane-wise adds as above, but the results are stored through a struct
; pointer; the adds should vectorize even though the stores stay scalar.
define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %b0, <2 x float> %b1, %struct.Vector4* nocapture dereferenceable(16) %r) {
; CHECK-LABEL: @add_aggregate_store(
; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> [[A0:%.*]], [[B0:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
; CHECK-NEXT: [[R0:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4:%.*]], %struct.Vector4* [[R:%.*]], i64 0, i32 0
; CHECK-NEXT: store float [[TMP2]], float* [[R0]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
; CHECK-NEXT: [[R1:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 1
; CHECK-NEXT: store float [[TMP3]], float* [[R1]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[A1:%.*]], [[B1:%.*]]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
; CHECK-NEXT: [[R2:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 2
; CHECK-NEXT: store float [[TMP5]], float* [[R2]], align 4
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
; CHECK-NEXT: [[R3:%.*]] = getelementptr inbounds [[STRUCT_VECTOR4]], %struct.Vector4* [[R]], i64 0, i32 3
; CHECK-NEXT: store float [[TMP6]], float* [[R3]], align 4
; CHECK-NEXT: ret void
;
  %a00 = extractelement <2 x float> %a0, i32 0
  %b00 = extractelement <2 x float> %b0, i32 0
  %add = fadd float %a00, %b00
  %r0 = getelementptr inbounds %struct.Vector4, %struct.Vector4* %r, i64 0, i32 0
  store float %add, float* %r0, align 4
  %a01 = extractelement <2 x float> %a0, i32 1
  %b01 = extractelement <2 x float> %b0, i32 1
  %add4 = fadd float %a01, %b01
  %r1 = getelementptr inbounds %struct.Vector4, %struct.Vector4* %r, i64 0, i32 1
  store float %add4, float* %r1, align 4
  %a10 = extractelement <2 x float> %a1, i32 0
  %b10 = extractelement <2 x float> %b1, i32 0
  %add7 = fadd float %a10, %b10
  %r2 = getelementptr inbounds %struct.Vector4, %struct.Vector4* %r, i64 0, i32 2
  store float %add7, float* %r2, align 4
  %a11 = extractelement <2 x float> %a1, i32 1
  %b11 = extractelement <2 x float> %b1, i32 1
  %add10 = fadd float %a11, %b11
  %r3 = getelementptr inbounds %struct.Vector4, %struct.Vector4* %r, i64 0, i32 3
  store float %add10, float* %r3, align 4
  ret void
}