diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 940d18a17b24..571e2692cbff 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -172,6 +172,24 @@ def form_duplane : GICombineRule <
   (apply [{ applyDupLane(*${root}, MRI, B, ${matchinfo}); }])
 >;
 
+// Clean up G_UNMERGE(G_DUPLANEn) -> G_DUPLANEn.
+// Matches a two-result G_UNMERGE_VALUES of a G_DUPLANEn whose results are
+// 64 bits wide, rebuilds the G_DUPLANEn directly at the 64-bit result type and
+// reuses that single value for both unmerge results.
+class unmerge_duplane<Instruction Op> : GICombineRule <
+  (defs root:$root),
+  (match (Op $a, $src, $c),
+         (G_UNMERGE_VALUES $d1, $d2, $a):$root,
+         [{ return MRI.getType(${d1}.getReg()).getSizeInBits() == 64; }]),
+  (apply (GIReplaceReg $d2, $d1), (Op $d1, $src, $c))
+>;
+def unmerge_duplane8 : unmerge_duplane<G_DUPLANE8>;
+def unmerge_duplane16 : unmerge_duplane<G_DUPLANE16>;
+def unmerge_duplane32 : unmerge_duplane<G_DUPLANE32>;
+// G_DUPLANE64 is not included as its unmerged results are scalar.
+def unmerge_duplanes : GICombineGroup<[unmerge_duplane8, unmerge_duplane16,
+                                       unmerge_duplane32]>;
+
 def shuffle_vector_lowering : GICombineGroup<[dup, form_duplane, rev, ext, zip,
                                               uzp, trn, fullrev, shuf_to_ins]>;
 
@@ -325,7 +343,8 @@ def AArch64PostLegalizerLowering
                         lower_vector_fcmp, form_truncstore,
                         vector_sext_inreg_to_shift,
                         unmerge_ext_to_unmerge, lower_mulv2s64,
-                        vector_unmerge_lowering, insertelt_nonconst]> {
+                        vector_unmerge_lowering, insertelt_nonconst,
+                        unmerge_duplanes]> {
 }
 
 // Post-legalization combines which are primarily optimizations.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-unmergedup.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-unmergedup.mir new file mode 100644 index 000000000000..acfbec0dd0ef --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-unmergedup.mir @@ -0,0 +1,91 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=aarch64-postlegalizer-lowering -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: unmerge_dup8 +legalized: true +body: | + bb.1.entry: + ; CHECK-LABEL: name: unmerge_dup8 + ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[DUPLANE8_:%[0-9]+]]:_(<8 x s8>) = G_DUPLANE8 [[COPY]], [[C]](s64) + ; CHECK-NEXT: $d0 = COPY [[DUPLANE8_]](<8 x s8>) + ; CHECK-NEXT: $d1 = COPY [[DUPLANE8_]](<8 x s8>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(<16 x s8>) = COPY $q0 + %1:_(s64) = G_CONSTANT i64 1 + %2:_(<16 x s8>) = G_DUPLANE8 %0, %1 + %3:_(<8 x s8>), %4:_(<8 x s8>) = G_UNMERGE_VALUES %2 + $d0 = COPY %3 + $d1 = COPY %4 + RET_ReallyLR implicit $x0 + +... +--- +name: unmerge_dup16 +legalized: true +body: | + bb.1.entry: + ; CHECK-LABEL: name: unmerge_dup16 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[DUPLANE16_:%[0-9]+]]:_(<4 x s16>) = G_DUPLANE16 [[COPY]], [[C]](s64) + ; CHECK-NEXT: $d0 = COPY [[DUPLANE16_]](<4 x s16>) + ; CHECK-NEXT: $d1 = COPY [[DUPLANE16_]](<4 x s16>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(<8 x s16>) = COPY $q0 + %1:_(s64) = G_CONSTANT i64 1 + %2:_(<8 x s16>) = G_DUPLANE16 %0, %1 + %3:_(<4 x s16>), %4:_(<4 x s16>) = G_UNMERGE_VALUES %2 + $d0 = COPY %3 + $d1 = COPY %4 + RET_ReallyLR implicit $x0 + +... 
+--- +name: unmerge_dup32 +legalized: true +body: | + bb.1.entry: + ; CHECK-LABEL: name: unmerge_dup32 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[DUPLANE32_:%[0-9]+]]:_(<2 x s32>) = G_DUPLANE32 [[COPY]], [[C]](s64) + ; CHECK-NEXT: $d0 = COPY [[DUPLANE32_]](<2 x s32>) + ; CHECK-NEXT: $d1 = COPY [[DUPLANE32_]](<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(<4 x s32>) = COPY $q0 + %1:_(s64) = G_CONSTANT i64 1 + %2:_(<4 x s32>) = G_DUPLANE32 %0, %1 + %3:_(<2 x s32>), %4:_(<2 x s32>) = G_UNMERGE_VALUES %2 + $d0 = COPY %3 + $d1 = COPY %4 + RET_ReallyLR implicit $x0 + +... +--- +name: unmerge_dup64 +legalized: true +body: | + bb.1.entry: + ; CHECK-LABEL: name: unmerge_dup64 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[DUPLANE64_:%[0-9]+]]:_(<2 x s64>) = G_DUPLANE64 [[COPY]], [[C]](s64) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[DUPLANE64_]](<2 x s64>), [[C1]](s64) + ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[DUPLANE64_]](<2 x s64>), [[C2]](s64) + ; CHECK-NEXT: $d0 = COPY [[EVEC]](s64) + ; CHECK-NEXT: $d1 = COPY [[EVEC1]](s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %0:_(<2 x s64>) = COPY $q0 + %1:_(s64) = G_CONSTANT i64 1 + %2:_(<2 x s64>) = G_DUPLANE64 %0, %1 + %3:_(s64), %4:_(s64) = G_UNMERGE_VALUES %2 + $d0 = COPY %3 + $d1 = COPY %4 + RET_ReallyLR implicit $x0 + +... diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll index c279cf0f241d..49fb6c98e223 100644 --- a/llvm/test/CodeGen/AArch64/arm64-dup.ll +++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll @@ -401,16 +401,10 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) { ; SelectionDAGBuilder here. 
We then added a DUPLANE on top of that, preventing ; the formation of an indexed-by-7 MLS. define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { -; CHECK-SD-LABEL: test_high_splat: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mls.4h v0, v1, v2[7] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_high_splat: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup.8h v2, v2[7] -; CHECK-GI-NEXT: mls.4h v0, v2, v1 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_high_splat: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mls.4h v0, v1, v2[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %b diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll index c3ad3b4192cf..85d8b7c3e286 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll @@ -159,16 +159,10 @@ entry: } define <4 x i16> @test_vmla_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmla_laneq_s16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mla v0.4h, v1.4h, v2.h[7] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmla_laneq_s16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.8h, v2.h[7] -; CHECK-GI-NEXT: mla v0.4h, v2.4h, v1.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmla_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mla v0.4h, v1.4h, v2.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %b @@ -189,16 +183,10 @@ entry: } define <2 x i32> @test_vmla_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmla_laneq_s32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mla v0.2s, v1.2s, v2.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmla_laneq_s32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[3] -; CHECK-GI-NEXT: mla 
v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmla_laneq_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mla v0.2s, v1.2s, v2.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %b @@ -271,16 +259,10 @@ entry: } define <4 x i16> @test_vmls_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmls_laneq_s16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mls v0.4h, v1.4h, v2.h[7] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmls_laneq_s16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.8h, v2.h[7] -; CHECK-GI-NEXT: mls v0.4h, v2.4h, v1.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmls_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mls v0.4h, v1.4h, v2.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %b @@ -301,16 +283,10 @@ entry: } define <2 x i32> @test_vmls_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmls_laneq_s32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmls_laneq_s32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[3] -; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmls_laneq_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mls v0.2s, v1.2s, v2.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %b @@ -427,16 +403,10 @@ entry: } define <4 x i16> @test_vmul_laneq_s16(<4 x i16> %a, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmul_laneq_s16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.h[7] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmul_laneq_s16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.8h, v1.h[7] -; CHECK-GI-NEXT: mul v0.4h, v1.4h, v0.4h -; 
CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmul_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %a @@ -455,16 +425,10 @@ entry: } define <2 x i32> @test_vmul_laneq_s32(<2 x i32> %a, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmul_laneq_s32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmul_laneq_s32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.4s, v1.s[3] -; CHECK-GI-NEXT: mul v0.2s, v1.2s, v0.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmul_laneq_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %a @@ -483,16 +447,10 @@ entry: } define <4 x i16> @test_vmul_laneq_u16(<4 x i16> %a, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmul_laneq_u16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.h[7] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmul_laneq_u16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.8h, v1.h[7] -; CHECK-GI-NEXT: mul v0.4h, v1.4h, v0.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmul_laneq_u16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %mul = mul <4 x i16> %shuffle, %a @@ -511,16 +469,10 @@ entry: } define <2 x i32> @test_vmul_laneq_u32(<2 x i32> %a, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmul_laneq_u32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmul_laneq_u32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.4s, v1.s[3] -; CHECK-GI-NEXT: mul v0.2s, v1.2s, v0.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: 
test_vmul_laneq_u32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %mul = mul <2 x i32> %shuffle, %a @@ -567,16 +519,10 @@ entry: declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) { -; CHECK-SD-LABEL: test_vfma_laneq_f32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fmla v0.2s, v1.2s, v2.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vfma_laneq_f32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[3] -; CHECK-GI-NEXT: fmla v0.2s, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vfma_laneq_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmla v0.2s, v1.2s, v2.s[3] +; CHECK-NEXT: ret entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -834,16 +780,10 @@ entry: } define <4 x i32> @test_vmlal_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmlal_laneq_s16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: smlal v0.4s, v1.4h, v2.h[7] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmlal_laneq_s16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.8h, v2.h[7] -; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlal_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smlal v0.4s, v1.4h, v2.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -852,16 +792,10 @@ entry: } define <2 x i64> @test_vmlal_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmlal_laneq_s32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: smlal v0.2d, v1.2s, v2.s[3] -; 
CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmlal_laneq_s32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[3] -; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlal_laneq_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smlal v0.2d, v1.2s, v2.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -920,8 +854,7 @@ define <4 x i32> @test_vmlal_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16 ; CHECK-GI-LABEL: test_vmlal_high_laneq_s16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: dup v2.8h, v2.h[7] -; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.4h +; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.h[7] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> @@ -940,8 +873,7 @@ define <2 x i64> @test_vmlal_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32 ; CHECK-GI-LABEL: test_vmlal_high_laneq_s32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: dup v2.4s, v2.s[3] -; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.2s +; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.s[3] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> @@ -978,16 +910,10 @@ entry: } define <4 x i32> @test_vmlsl_laneq_s16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmlsl_laneq_s16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: smlsl v0.4s, v1.4h, v2.h[7] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmlsl_laneq_s16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.8h, v2.h[7] -; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlsl_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 
x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -996,16 +922,10 @@ entry: } define <2 x i64> @test_vmlsl_laneq_s32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmlsl_laneq_s32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: smlsl v0.2d, v1.2s, v2.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmlsl_laneq_s32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[3] -; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlsl_laneq_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -1064,8 +984,7 @@ define <4 x i32> @test_vmlsl_high_laneq_s16(<4 x i32> %a, <8 x i16> %b, <8 x i16 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: dup v2.8h, v2.h[7] -; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.4h +; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.h[7] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> @@ -1084,8 +1003,7 @@ define <2 x i64> @test_vmlsl_high_laneq_s32(<2 x i64> %a, <4 x i32> %b, <4 x i32 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: dup v2.4s, v2.s[3] -; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.2s +; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.s[3] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> @@ -1122,16 +1040,10 @@ entry: } define <4 x i32> @test_vmlal_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmlal_laneq_u16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: umlal v0.4s, v1.4h, v2.h[7] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: 
test_vmlal_laneq_u16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.8h, v2.h[7] -; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlal_laneq_u16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umlal v0.4s, v1.4h, v2.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -1140,16 +1052,10 @@ entry: } define <2 x i64> @test_vmlal_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmlal_laneq_u32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: umlal v0.2d, v1.2s, v2.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmlal_laneq_u32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[3] -; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlal_laneq_u32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umlal v0.2d, v1.2s, v2.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -1208,8 +1114,7 @@ define <4 x i32> @test_vmlal_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16 ; CHECK-GI-LABEL: test_vmlal_high_laneq_u16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: dup v2.8h, v2.h[7] -; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h +; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.h[7] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> @@ -1228,8 +1133,7 @@ define <2 x i64> @test_vmlal_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32 ; CHECK-GI-LABEL: test_vmlal_high_laneq_u32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: dup v2.4s, v2.s[3] -; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s +; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.s[3] ; CHECK-GI-NEXT: ret entry: 
%shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> @@ -1266,16 +1170,10 @@ entry: } define <4 x i32> @test_vmlsl_laneq_u16(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmlsl_laneq_u16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: umlsl v0.4s, v1.4h, v2.h[7] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmlsl_laneq_u16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.8h, v2.h[7] -; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlsl_laneq_u16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -1284,16 +1182,10 @@ entry: } define <2 x i64> @test_vmlsl_laneq_u32(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmlsl_laneq_u32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: umlsl v0.2d, v1.2s, v2.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmlsl_laneq_u32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[3] -; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlsl_laneq_u32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -1352,8 +1244,7 @@ define <4 x i32> @test_vmlsl_high_laneq_u16(<4 x i32> %a, <8 x i16> %b, <8 x i16 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: dup v2.8h, v2.h[7] -; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h +; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.h[7] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> @@ -1372,8 +1263,7 
@@ define <2 x i64> @test_vmlsl_high_laneq_u32(<2 x i64> %a, <4 x i32> %b, <4 x i32 ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: dup v2.4s, v2.s[3] -; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s +; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.s[3] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> @@ -1512,16 +1402,10 @@ entry: } define <4 x i32> @test_vmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmull_laneq_s16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: smull v0.4s, v0.4h, v1.h[7] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmull_laneq_s16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.8h, v1.h[7] -; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmull_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull v0.4s, v0.4h, v1.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1529,16 +1413,10 @@ entry: } define <2 x i64> @test_vmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmull_laneq_s32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: smull v0.2d, v0.2s, v1.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmull_laneq_s32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.4s, v1.s[3] -; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmull_laneq_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull v0.2d, v0.2s, v1.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1546,16 +1424,10 @@ entry: } define <4 x i32> @test_vmull_laneq_u16(<4 x i16> %a, <8 x i16> %v) { -; CHECK-SD-LABEL: 
test_vmull_laneq_u16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.h[7] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmull_laneq_u16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.8h, v1.h[7] -; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmull_laneq_u16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umull v0.4s, v0.4h, v1.h[7] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1563,16 +1435,10 @@ entry: } define <2 x i64> @test_vmull_laneq_u32(<2 x i32> %a, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmull_laneq_u32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: umull v0.2d, v0.2s, v1.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmull_laneq_u32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.4s, v1.s[3] -; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmull_laneq_u32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umull v0.2d, v0.2s, v1.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1588,8 +1454,7 @@ define <4 x i32> @test_vmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; CHECK-GI-LABEL: test_vmull_high_laneq_s16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: dup v1.8h, v1.h[7] -; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.h[7] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -1607,8 +1472,7 @@ define <2 x i64> @test_vmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; CHECK-GI-LABEL: test_vmull_high_laneq_s32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: dup v1.4s, v1.s[3] 
-; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.s[3] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> @@ -1626,8 +1490,7 @@ define <4 x i32> @test_vmull_high_laneq_u16(<8 x i16> %a, <8 x i16> %v) { ; CHECK-GI-LABEL: test_vmull_high_laneq_u16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: dup v1.8h, v1.h[7] -; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.h[7] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -1645,8 +1508,7 @@ define <2 x i64> @test_vmull_high_laneq_u32(<4 x i32> %a, <4 x i32> %v) { ; CHECK-GI-LABEL: test_vmull_high_laneq_u32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: dup v1.4s, v1.s[3] -; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.s[3] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> @@ -1816,16 +1678,10 @@ entry: } define <4 x i32> @test_vqdmull_laneq_s16(<4 x i16> %a, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vqdmull_laneq_s16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sqdmull v0.4s, v0.4h, v1.h[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vqdmull_laneq_s16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.8h, v1.h[3] -; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vqdmull_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.h[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -1833,16 +1689,10 @@ entry: } define <2 x i64> @test_vqdmull_laneq_s32(<2 x i32> %a, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vqdmull_laneq_s32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sqdmull v0.2d, 
v0.2s, v1.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vqdmull_laneq_s32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.4s, v1.s[3] -; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vqdmull_laneq_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -1898,8 +1748,7 @@ define <4 x i32> @test_vqdmull_high_laneq_s16(<8 x i16> %a, <8 x i16> %v) { ; CHECK-GI-LABEL: test_vqdmull_high_laneq_s16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: dup v1.8h, v1.h[7] -; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.h[7] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -1917,8 +1766,7 @@ define <2 x i64> @test_vqdmull_high_laneq_s32(<4 x i32> %a, <4 x i32> %v) { ; CHECK-GI-LABEL: test_vqdmull_high_laneq_s32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: dup v1.4s, v1.s[3] -; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.s[3] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> @@ -2322,16 +2170,10 @@ entry: } define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) { -; CHECK-SD-LABEL: test_vmul_laneq_f32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmul_laneq_f32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.4s, v1.s[3] -; CHECK-GI-NEXT: fmul v0.2s, v1.2s, v0.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmul_laneq_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> 
%v, <4 x float> undef, <2 x i32> %mul = fmul <2 x float> %shuffle, %a @@ -2553,16 +2395,10 @@ entry: } define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) { -; CHECK-SD-LABEL: test_vmulx_laneq_f32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fmulx v0.2s, v0.2s, v1.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmulx_laneq_f32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.4s, v1.s[3] -; CHECK-GI-NEXT: fmulx v0.2s, v0.2s, v1.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmulx_laneq_f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmulx v0.2s, v0.2s, v1.s[3] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) @@ -2657,16 +2493,10 @@ entry: } define <4 x i16> @test_vmla_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmla_laneq_s16_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mla v0.4h, v1.4h, v2.h[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmla_laneq_s16_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.8h, v2.h[0] -; CHECK-GI-NEXT: mla v0.4h, v2.4h, v1.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmla_laneq_s16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mla v0.4h, v1.4h, v2.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %b @@ -2687,16 +2517,10 @@ entry: } define <2 x i32> @test_vmla_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmla_laneq_s32_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mla v0.2s, v1.2s, v2.s[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmla_laneq_s32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[0] -; CHECK-GI-NEXT: mla v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmla_laneq_s32_0: +; CHECK: // %bb.0: // 
%entry +; CHECK-NEXT: mla v0.2s, v1.2s, v2.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %b @@ -2769,16 +2593,10 @@ entry: } define <4 x i16> @test_vmls_laneq_s16_0(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmls_laneq_s16_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mls v0.4h, v1.4h, v2.h[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmls_laneq_s16_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.8h, v2.h[0] -; CHECK-GI-NEXT: mls v0.4h, v2.4h, v1.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmls_laneq_s16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mls v0.4h, v1.4h, v2.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %b @@ -2799,16 +2617,10 @@ entry: } define <2 x i32> @test_vmls_laneq_s32_0(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmls_laneq_s32_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mls v0.2s, v1.2s, v2.s[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmls_laneq_s32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[0] -; CHECK-GI-NEXT: mls v0.2s, v2.2s, v1.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmls_laneq_s32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mls v0.2s, v1.2s, v2.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %b @@ -2925,16 +2737,10 @@ entry: } define <4 x i16> @test_vmul_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmul_laneq_s16_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.h[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmul_laneq_s16_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.8h, v1.h[0] -; CHECK-GI-NEXT: mul v0.4h, v1.4h, v0.4h -; CHECK-GI-NEXT: ret +; 
CHECK-LABEL: test_vmul_laneq_s16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %a @@ -2953,16 +2759,10 @@ entry: } define <2 x i32> @test_vmul_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmul_laneq_s32_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.s[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmul_laneq_s32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.4s, v1.s[0] -; CHECK-GI-NEXT: mul v0.2s, v1.2s, v0.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmul_laneq_s32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %a @@ -2981,16 +2781,10 @@ entry: } define <4 x i16> @test_vmul_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmul_laneq_u16_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mul v0.4h, v0.4h, v1.h[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmul_laneq_u16_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.8h, v1.h[0] -; CHECK-GI-NEXT: mul v0.4h, v1.4h, v0.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmul_laneq_u16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul v0.4h, v0.4h, v1.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %mul = mul <4 x i16> %shuffle, %a @@ -3009,16 +2803,10 @@ entry: } define <2 x i32> @test_vmul_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmul_laneq_u32_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mul v0.2s, v0.2s, v1.s[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmul_laneq_u32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.4s, v1.s[0] -; CHECK-GI-NEXT: mul v0.2s, v1.2s, v0.2s 
-; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmul_laneq_u32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mul v0.2s, v0.2s, v1.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %mul = mul <2 x i32> %shuffle, %a @@ -3061,16 +2849,10 @@ entry: } define <2 x float> @test_vfma_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) { -; CHECK-SD-LABEL: test_vfma_laneq_f32_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fmla v0.2s, v1.2s, v2.s[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vfma_laneq_f32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[0] -; CHECK-GI-NEXT: fmla v0.2s, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vfma_laneq_f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmla v0.2s, v1.2s, v2.s[0] +; CHECK-NEXT: ret entry: %lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a) @@ -3188,16 +2970,10 @@ entry: } define <4 x i32> @test_vmlal_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmlal_laneq_s16_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: smlal v0.4s, v1.4h, v2.h[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmlal_laneq_s16_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.8h, v2.h[0] -; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlal_laneq_s16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smlal v0.4s, v1.4h, v2.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -3206,16 +2982,10 @@ entry: } define <2 x i64> @test_vmlal_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmlal_laneq_s32_0: -; CHECK-SD: // %bb.0: // %entry -; 
CHECK-SD-NEXT: smlal v0.2d, v1.2s, v2.s[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmlal_laneq_s32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[0] -; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlal_laneq_s32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smlal v0.2d, v1.2s, v2.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -3274,8 +3044,7 @@ define <4 x i32> @test_vmlal_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i ; CHECK-GI-LABEL: test_vmlal_high_laneq_s16_0: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: dup v2.8h, v2.h[0] -; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.4h +; CHECK-GI-NEXT: smlal v0.4s, v1.4h, v2.h[0] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> @@ -3294,8 +3063,7 @@ define <2 x i64> @test_vmlal_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i ; CHECK-GI-LABEL: test_vmlal_high_laneq_s32_0: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: dup v2.4s, v2.s[0] -; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.2s +; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> @@ -3332,16 +3100,10 @@ entry: } define <4 x i32> @test_vmlsl_laneq_s16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmlsl_laneq_s16_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: smlsl v0.4s, v1.4h, v2.h[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmlsl_laneq_s16_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.8h, v2.h[0] -; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlsl_laneq_s16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smlsl v0.4s, v1.4h, v2.h[0] 
+; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -3350,16 +3112,10 @@ entry: } define <2 x i64> @test_vmlsl_laneq_s32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmlsl_laneq_s32_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: smlsl v0.2d, v1.2s, v2.s[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmlsl_laneq_s32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[0] -; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlsl_laneq_s32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smlsl v0.2d, v1.2s, v2.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -3418,8 +3174,7 @@ define <4 x i32> @test_vmlsl_high_laneq_s16_0(<4 x i32> %a, <8 x i16> %b, <8 x i ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s16_0: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: dup v2.8h, v2.h[0] -; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.4h +; CHECK-GI-NEXT: smlsl v0.4s, v1.4h, v2.h[0] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> @@ -3438,8 +3193,7 @@ define <2 x i64> @test_vmlsl_high_laneq_s32_0(<2 x i64> %a, <4 x i32> %b, <4 x i ; CHECK-GI-LABEL: test_vmlsl_high_laneq_s32_0: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: dup v2.4s, v2.s[0] -; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.2s +; CHECK-GI-NEXT: smlsl v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> @@ -3476,16 +3230,10 @@ entry: } define <4 x i32> @test_vmlal_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK-SD-LABEL: 
test_vmlal_laneq_u16_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: umlal v0.4s, v1.4h, v2.h[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmlal_laneq_u16_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.8h, v2.h[0] -; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlal_laneq_u16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umlal v0.4s, v1.4h, v2.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -3494,16 +3242,10 @@ entry: } define <2 x i64> @test_vmlal_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmlal_laneq_u32_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: umlal v0.2d, v1.2s, v2.s[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmlal_laneq_u32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[0] -; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlal_laneq_u32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umlal v0.2d, v1.2s, v2.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -3562,8 +3304,7 @@ define <4 x i32> @test_vmlal_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i ; CHECK-GI-LABEL: test_vmlal_high_laneq_u16_0: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: dup v2.8h, v2.h[0] -; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.4h +; CHECK-GI-NEXT: umlal v0.4s, v1.4h, v2.h[0] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> @@ -3582,8 +3323,7 @@ define <2 x i64> @test_vmlal_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i ; CHECK-GI-LABEL: test_vmlal_high_laneq_u32_0: ; CHECK-GI: // %bb.0: // 
%entry ; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: dup v2.4s, v2.s[0] -; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.2s +; CHECK-GI-NEXT: umlal v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> @@ -3620,16 +3360,10 @@ entry: } define <4 x i32> @test_vmlsl_laneq_u16_0(<4 x i32> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmlsl_laneq_u16_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: umlsl v0.4s, v1.4h, v2.h[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmlsl_laneq_u16_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.8h, v2.h[0] -; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlsl_laneq_u16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umlsl v0.4s, v1.4h, v2.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %b, <4 x i16> %shuffle) @@ -3638,16 +3372,10 @@ entry: } define <2 x i64> @test_vmlsl_laneq_u32_0(<2 x i64> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmlsl_laneq_u32_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: umlsl v0.2d, v1.2s, v2.s[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmlsl_laneq_u32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[0] -; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmlsl_laneq_u32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umlsl v0.2d, v1.2s, v2.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %b, <2 x i32> %shuffle) @@ -3706,8 +3434,7 @@ define <4 x i32> @test_vmlsl_high_laneq_u16_0(<4 x i32> %a, <8 x i16> %b, <8 x i ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u16_0: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, 
v1.d[1] -; CHECK-GI-NEXT: dup v2.8h, v2.h[0] -; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.4h +; CHECK-GI-NEXT: umlsl v0.4s, v1.4h, v2.h[0] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> @@ -3726,8 +3453,7 @@ define <2 x i64> @test_vmlsl_high_laneq_u32_0(<2 x i64> %a, <4 x i32> %b, <4 x i ; CHECK-GI-LABEL: test_vmlsl_high_laneq_u32_0: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d1, v1.d[1] -; CHECK-GI-NEXT: dup v2.4s, v2.s[0] -; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.2s +; CHECK-GI-NEXT: umlsl v0.2d, v1.2s, v2.s[0] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> @@ -3866,16 +3592,10 @@ entry: } define <4 x i32> @test_vmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmull_laneq_s16_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: smull v0.4s, v0.4h, v1.h[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmull_laneq_s16_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.8h, v1.h[0] -; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmull_laneq_s16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull v0.4s, v0.4h, v1.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -3883,16 +3603,10 @@ entry: } define <2 x i64> @test_vmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmull_laneq_s32_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: smull v0.2d, v0.2s, v1.s[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmull_laneq_s32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.4s, v1.s[0] -; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmull_laneq_s32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: smull v0.2d, v0.2s, v1.s[0] +; CHECK-NEXT: ret entry: %shuffle = 
shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -3900,16 +3614,10 @@ entry: } define <4 x i32> @test_vmull_laneq_u16_0(<4 x i16> %a, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vmull_laneq_u16_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: umull v0.4s, v0.4h, v1.h[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmull_laneq_u16_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.8h, v1.h[0] -; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmull_laneq_u16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umull v0.4s, v0.4h, v1.h[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -3917,16 +3625,10 @@ entry: } define <2 x i64> @test_vmull_laneq_u32_0(<2 x i32> %a, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vmull_laneq_u32_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: umull v0.2d, v0.2s, v1.s[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmull_laneq_u32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.4s, v1.s[0] -; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmull_laneq_u32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: umull v0.2d, v0.2s, v1.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -3942,8 +3644,7 @@ define <4 x i32> @test_vmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK-GI-LABEL: test_vmull_high_laneq_s16_0: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: dup v1.8h, v1.h[0] -; CHECK-GI-NEXT: smull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: smull v0.4s, v0.4h, 
v1.h[0] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -3961,8 +3662,7 @@ define <2 x i64> @test_vmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK-GI-LABEL: test_vmull_high_laneq_s32_0: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: dup v1.4s, v1.s[0] -; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: smull v0.2d, v0.2s, v1.s[0] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> @@ -3980,8 +3680,7 @@ define <4 x i32> @test_vmull_high_laneq_u16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK-GI-LABEL: test_vmull_high_laneq_u16_0: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: dup v1.8h, v1.h[0] -; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: umull v0.4s, v0.4h, v1.h[0] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -3999,8 +3698,7 @@ define <2 x i64> @test_vmull_high_laneq_u32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK-GI-LABEL: test_vmull_high_laneq_u32_0: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: dup v1.4s, v1.s[0] -; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: umull v0.2d, v0.2s, v1.s[0] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> @@ -4170,16 +3868,10 @@ entry: } define <4 x i32> @test_vqdmull_laneq_s16_0(<4 x i16> %a, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vqdmull_laneq_s16_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sqdmull v0.4s, v0.4h, v1.h[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vqdmull_laneq_s16_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.8h, v1.h[0] -; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vqdmull_laneq_s16_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqdmull v0.4s, v0.4h, v1.h[0] +; CHECK-NEXT: ret entry: %shuffle = 
shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> zeroinitializer %vqdmull2.i = tail call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> %a, <4 x i16> %shuffle) @@ -4187,16 +3879,10 @@ entry: } define <2 x i64> @test_vqdmull_laneq_s32_0(<2 x i32> %a, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vqdmull_laneq_s32_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sqdmull v0.2d, v0.2s, v1.s[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vqdmull_laneq_s32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.4s, v1.s[0] -; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vqdmull_laneq_s32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqdmull v0.2d, v0.2s, v1.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <2 x i32> zeroinitializer %vqdmull2.i = tail call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> %a, <2 x i32> %shuffle) @@ -4252,8 +3938,7 @@ define <4 x i32> @test_vqdmull_high_laneq_s16_0(<8 x i16> %a, <8 x i16> %v) { ; CHECK-GI-LABEL: test_vqdmull_high_laneq_s16_0: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: dup v1.8h, v1.h[0] -; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.4h +; CHECK-GI-NEXT: sqdmull v0.4s, v0.4h, v1.h[0] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -4271,8 +3956,7 @@ define <2 x i64> @test_vqdmull_high_laneq_s32_0(<4 x i32> %a, <4 x i32> %v) { ; CHECK-GI-LABEL: test_vqdmull_high_laneq_s32_0: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov d0, v0.d[1] -; CHECK-GI-NEXT: dup v1.4s, v1.s[0] -; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: sqdmull v0.2d, v0.2s, v1.s[0] ; CHECK-GI-NEXT: ret entry: %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> @@ -4402,16 +4086,10 @@ entry: } define <2 x float> @test_vmul_laneq_f32_0(<2 x float> %a, <4 x float> %v) { -; CHECK-SD-LABEL: test_vmul_laneq_f32_0: -; CHECK-SD: // %bb.0: // 
%entry -; CHECK-SD-NEXT: fmul v0.2s, v0.2s, v1.s[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmul_laneq_f32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.4s, v1.s[0] -; CHECK-GI-NEXT: fmul v0.2s, v1.2s, v0.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmul_laneq_f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmul v0.2s, v0.2s, v1.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %mul = fmul <2 x float> %shuffle, %a @@ -4498,16 +4176,10 @@ entry: } define <2 x float> @test_vmulx_laneq_f32_0(<2 x float> %a, <4 x float> %v) { -; CHECK-SD-LABEL: test_vmulx_laneq_f32_0: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fmulx v0.2s, v0.2s, v1.s[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vmulx_laneq_f32_0: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v1.4s, v1.s[0] -; CHECK-GI-NEXT: fmulx v0.2s, v0.2s, v1.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vmulx_laneq_f32_0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmulx v0.2s, v0.2s, v1.s[0] +; CHECK-NEXT: ret entry: %shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> zeroinitializer %vmulx2.i = tail call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> %a, <2 x float> %shuffle) diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll index ddd8a72618b1..367105f78381 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -807,46 +807,28 @@ define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 { } define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 { -; CHECK-SD-LABEL: test_vdup_laneq_s8: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: dup v0.8b, v0.b[5] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vdup_laneq_s8: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: dup v0.16b, v0.b[5] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: 
test_vdup_laneq_s8: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.8b, v0.b[5] +; CHECK-NEXT: ret %shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> ret <8 x i8> %shuffle } define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 { -; CHECK-SD-LABEL: test_vdup_laneq_s16: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: dup v0.4h, v0.h[2] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vdup_laneq_s16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: dup v0.8h, v0.h[2] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vdup_laneq_s16: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.4h, v0.h[2] +; CHECK-NEXT: ret %shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> ret <4 x i16> %shuffle } define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 { -; CHECK-SD-LABEL: test_vdup_laneq_s32: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: dup v0.2s, v0.s[1] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vdup_laneq_s32: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: dup v0.4s, v0.s[1] -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vdup_laneq_s32: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.2s, v0.s[1] +; CHECK-NEXT: ret %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> ret <2 x i32> %shuffle } diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll index bb97ba6d9265..cb14adc00df0 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll @@ -569,16 +569,10 @@ define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) { ; Using sqrdmlah intrinsics define <4 x i16> @test_vqrdmlah_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vqrdmlah_laneq_s16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sqrdmlah v0.4h, v1.4h, v2.h[7] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vqrdmlah_laneq_s16: -; CHECK-GI: // %bb.0: // 
%entry -; CHECK-GI-NEXT: dup v2.8h, v2.h[7] -; CHECK-GI-NEXT: sqrdmlah v0.4h, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vqrdmlah_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrdmlah v0.4h, v1.4h, v2.h[7] +; CHECK-NEXT: ret entry: %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> %vqrdmlah_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4 @@ -586,16 +580,10 @@ entry: } define <2 x i32> @test_vqrdmlah_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vqrdmlah_laneq_s32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sqrdmlah v0.2s, v1.2s, v2.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vqrdmlah_laneq_s32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[3] -; CHECK-GI-NEXT: sqrdmlah v0.2s, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vqrdmlah_laneq_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrdmlah v0.2s, v1.2s, v2.s[3] +; CHECK-NEXT: ret entry: %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> %vqrdmlah_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4 @@ -700,22 +688,13 @@ entry: } define i16 @test_vqrdmlahh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) { -; CHECK-SD-LABEL: test_vqrdmlahh_laneq_s16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fmov s1, w0 -; CHECK-SD-NEXT: fmov s2, w1 -; CHECK-SD-NEXT: sqrdmlah v1.4h, v2.4h, v0.h[7] -; CHECK-SD-NEXT: umov w0, v1.h[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vqrdmlahh_laneq_s16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v0.8h, v0.h[7] -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: fmov s2, w1 -; CHECK-GI-NEXT: sqrdmlah v1.4h, v2.4h, v0.4h -; CHECK-GI-NEXT: umov w0, v1.h[0] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vqrdmlahh_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: fmov s2, w1 +; CHECK-NEXT: sqrdmlah v1.4h, 
v2.4h, v0.h[7] +; CHECK-NEXT: umov w0, v1.h[0] +; CHECK-NEXT: ret entry: %0 = insertelement <4 x i16> undef, i16 %a, i64 0 %1 = insertelement <4 x i16> undef, i16 %b, i64 0 @@ -740,16 +719,10 @@ entry: } define <4 x i16> @test_vqrdmlsh_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) { -; CHECK-SD-LABEL: test_vqrdmlsh_laneq_s16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sqrdmlsh v0.4h, v1.4h, v2.h[7] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vqrdmlsh_laneq_s16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.8h, v2.h[7] -; CHECK-GI-NEXT: sqrdmlsh v0.4h, v1.4h, v2.4h -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vqrdmlsh_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrdmlsh v0.4h, v1.4h, v2.h[7] +; CHECK-NEXT: ret entry: %lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> %vqrdmlsh_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4 @@ -757,16 +730,10 @@ entry: } define <2 x i32> @test_vqrdmlsh_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) { -; CHECK-SD-LABEL: test_vqrdmlsh_laneq_s32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: sqrdmlsh v0.2s, v1.2s, v2.s[3] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vqrdmlsh_laneq_s32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v2.4s, v2.s[3] -; CHECK-GI-NEXT: sqrdmlsh v0.2s, v1.2s, v2.2s -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vqrdmlsh_laneq_s32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sqrdmlsh v0.2s, v1.2s, v2.s[3] +; CHECK-NEXT: ret entry: %lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> %vqrdmlsh_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4 @@ -871,22 +838,13 @@ entry: } define i16 @test_vqrdmlshh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) { -; CHECK-SD-LABEL: test_vqrdmlshh_laneq_s16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: fmov s1, w0 -; CHECK-SD-NEXT: fmov s2, w1 -; CHECK-SD-NEXT: sqrdmlsh v1.4h, 
v2.4h, v0.h[7] -; CHECK-SD-NEXT: umov w0, v1.h[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_vqrdmlshh_laneq_s16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: dup v0.8h, v0.h[7] -; CHECK-GI-NEXT: fmov s1, w0 -; CHECK-GI-NEXT: fmov s2, w1 -; CHECK-GI-NEXT: sqrdmlsh v1.4h, v2.4h, v0.4h -; CHECK-GI-NEXT: umov w0, v1.h[0] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_vqrdmlshh_laneq_s16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: fmov s2, w1 +; CHECK-NEXT: sqrdmlsh v1.4h, v2.4h, v0.h[7] +; CHECK-NEXT: umov w0, v1.h[0] +; CHECK-NEXT: ret entry: %0 = insertelement <4 x i16> undef, i16 %a, i64 0 %1 = insertelement <4 x i16> undef, i16 %b, i64 0