[AArch64][GlobalISel] Combine G_UNMERGE(G_DUPLANE16) -> G_DUPLANE16 (#142731)

We will generate G_UNMERGE(G_DUPLANE16) due to the legalization of
shuffle vector splats with mismatching vector sizes. The G_DUPLANE
intrinsics can handle different vector sizes (128-bit and 64-bit output,
for example), and we can combine away the unmerge.
This commit is contained in:
David Green
2025-06-09 20:57:45 +01:00
committed by GitHub
parent 274f5a817b
commit 6cbd91ea52
6 changed files with 382 additions and 669 deletions

View File

@@ -172,6 +172,21 @@ def form_duplane : GICombineRule <
(apply [{ applyDupLane(*${root}, MRI, B, ${matchinfo}); }])
>;
// Clean up G_UNMERGE(G_DUPLANE8/16/32) -> G_DUPLANE8/16/32
class unmerge_duplane<Instruction Op> : GICombineRule <
(defs root:$root),
(match (Op $a, $src, $c),
(G_UNMERGE_VALUES $d1, $d2, $a):$root,
[{ return MRI.getType(${d1}.getReg()).getSizeInBits() == 64; }]),
(apply (GIReplaceReg $d2, $d1), (Op $d1, $src, $c))
>;
def unmerge_duplane8 : unmerge_duplane<G_DUPLANE8>;
def unmerge_duplane16 : unmerge_duplane<G_DUPLANE16>;
def unmerge_duplane32 : unmerge_duplane<G_DUPLANE32>;
// G_DUPLANE64 is not included because the result is scalar.
def unmerge_duplanes : GICombineGroup<[unmerge_duplane8, unmerge_duplane16,
unmerge_duplane32]>;
def shuffle_vector_lowering : GICombineGroup<[dup, form_duplane, rev, ext, zip,
uzp, trn, fullrev, shuf_to_ins]>;
@@ -325,7 +340,8 @@ def AArch64PostLegalizerLowering
lower_vector_fcmp, form_truncstore,
vector_sext_inreg_to_shift,
unmerge_ext_to_unmerge, lower_mulv2s64,
vector_unmerge_lowering, insertelt_nonconst]> {
vector_unmerge_lowering, insertelt_nonconst,
unmerge_duplanes]> {
}
// Post-legalization combines which are primarily optimizations.

View File

@@ -0,0 +1,91 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple aarch64 -run-pass=aarch64-postlegalizer-lowering -verify-machineinstrs %s -o - | FileCheck %s
---
name: unmerge_dup8
legalized: true
body: |
bb.1.entry:
; CHECK-LABEL: name: unmerge_dup8
; CHECK: [[COPY:%[0-9]+]]:_(<16 x s8>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; CHECK-NEXT: [[DUPLANE8_:%[0-9]+]]:_(<8 x s8>) = G_DUPLANE8 [[COPY]], [[C]](s64)
; CHECK-NEXT: $d0 = COPY [[DUPLANE8_]](<8 x s8>)
; CHECK-NEXT: $d1 = COPY [[DUPLANE8_]](<8 x s8>)
; CHECK-NEXT: RET_ReallyLR implicit $x0
%0:_(<16 x s8>) = COPY $q0
%1:_(s64) = G_CONSTANT i64 1
%2:_(<16 x s8>) = G_DUPLANE8 %0, %1
%3:_(<8 x s8>), %4:_(<8 x s8>) = G_UNMERGE_VALUES %2
$d0 = COPY %3
$d1 = COPY %4
RET_ReallyLR implicit $x0
...
---
name: unmerge_dup16
legalized: true
body: |
bb.1.entry:
; CHECK-LABEL: name: unmerge_dup16
; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; CHECK-NEXT: [[DUPLANE16_:%[0-9]+]]:_(<4 x s16>) = G_DUPLANE16 [[COPY]], [[C]](s64)
; CHECK-NEXT: $d0 = COPY [[DUPLANE16_]](<4 x s16>)
; CHECK-NEXT: $d1 = COPY [[DUPLANE16_]](<4 x s16>)
; CHECK-NEXT: RET_ReallyLR implicit $x0
%0:_(<8 x s16>) = COPY $q0
%1:_(s64) = G_CONSTANT i64 1
%2:_(<8 x s16>) = G_DUPLANE16 %0, %1
%3:_(<4 x s16>), %4:_(<4 x s16>) = G_UNMERGE_VALUES %2
$d0 = COPY %3
$d1 = COPY %4
RET_ReallyLR implicit $x0
...
---
name: unmerge_dup32
legalized: true
body: |
bb.1.entry:
; CHECK-LABEL: name: unmerge_dup32
; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; CHECK-NEXT: [[DUPLANE32_:%[0-9]+]]:_(<2 x s32>) = G_DUPLANE32 [[COPY]], [[C]](s64)
; CHECK-NEXT: $d0 = COPY [[DUPLANE32_]](<2 x s32>)
; CHECK-NEXT: $d1 = COPY [[DUPLANE32_]](<2 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $x0
%0:_(<4 x s32>) = COPY $q0
%1:_(s64) = G_CONSTANT i64 1
%2:_(<4 x s32>) = G_DUPLANE32 %0, %1
%3:_(<2 x s32>), %4:_(<2 x s32>) = G_UNMERGE_VALUES %2
$d0 = COPY %3
$d1 = COPY %4
RET_ReallyLR implicit $x0
...
---
name: unmerge_dup64
legalized: true
body: |
bb.1.entry:
; CHECK-LABEL: name: unmerge_dup64
; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; CHECK-NEXT: [[DUPLANE64_:%[0-9]+]]:_(<2 x s64>) = G_DUPLANE64 [[COPY]], [[C]](s64)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[DUPLANE64_]](<2 x s64>), [[C1]](s64)
; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s64) = G_EXTRACT_VECTOR_ELT [[DUPLANE64_]](<2 x s64>), [[C2]](s64)
; CHECK-NEXT: $d0 = COPY [[EVEC]](s64)
; CHECK-NEXT: $d1 = COPY [[EVEC1]](s64)
; CHECK-NEXT: RET_ReallyLR implicit $x0
%0:_(<2 x s64>) = COPY $q0
%1:_(s64) = G_CONSTANT i64 1
%2:_(<2 x s64>) = G_DUPLANE64 %0, %1
%3:_(s64), %4:_(s64) = G_UNMERGE_VALUES %2
$d0 = COPY %3
$d1 = COPY %4
RET_ReallyLR implicit $x0
...

View File

@@ -401,16 +401,10 @@ define <4 x i16> @test_build_illegal(<4 x i32> %in) {
; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing
; the formation of an indexed-by-7 MLS.
define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 {
; CHECK-SD-LABEL: test_high_splat:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mls.4h v0, v1, v2[7]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_high_splat:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: dup.8h v2, v2[7]
; CHECK-GI-NEXT: mls.4h v0, v2, v1
; CHECK-GI-NEXT: ret
; CHECK-LABEL: test_high_splat:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mls.4h v0, v1, v2[7]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%mul = mul <4 x i16> %shuffle, %b

File diff suppressed because it is too large Load Diff

View File

@@ -807,46 +807,28 @@ define <2 x i64> @test_vdupq_lane_s64(<1 x i64> %v1) #0 {
}
define <8 x i8> @test_vdup_laneq_s8(<16 x i8> %v1) #0 {
; CHECK-SD-LABEL: test_vdup_laneq_s8:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: dup v0.8b, v0.b[5]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_vdup_laneq_s8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: dup v0.16b, v0.b[5]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
; CHECK-LABEL: test_vdup_laneq_s8:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.8b, v0.b[5]
; CHECK-NEXT: ret
%shuffle = shufflevector <16 x i8> %v1, <16 x i8> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
ret <8 x i8> %shuffle
}
define <4 x i16> @test_vdup_laneq_s16(<8 x i16> %v1) #0 {
; CHECK-SD-LABEL: test_vdup_laneq_s16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: dup v0.4h, v0.h[2]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_vdup_laneq_s16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: dup v0.8h, v0.h[2]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
; CHECK-LABEL: test_vdup_laneq_s16:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.4h, v0.h[2]
; CHECK-NEXT: ret
%shuffle = shufflevector <8 x i16> %v1, <8 x i16> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x i16> %shuffle
}
define <2 x i32> @test_vdup_laneq_s32(<4 x i32> %v1) #0 {
; CHECK-SD-LABEL: test_vdup_laneq_s32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: dup v0.2s, v0.s[1]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_vdup_laneq_s32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: dup v0.4s, v0.s[1]
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-GI-NEXT: ret
; CHECK-LABEL: test_vdup_laneq_s32:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v0.2s, v0.s[1]
; CHECK-NEXT: ret
%shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
ret <2 x i32> %shuffle
}

View File

@@ -569,16 +569,10 @@ define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
; Using sqrdmlah intrinsics
define <4 x i16> @test_vqrdmlah_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-SD-LABEL: test_vqrdmlah_laneq_s16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sqrdmlah v0.4h, v1.4h, v2.h[7]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_vqrdmlah_laneq_s16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
; CHECK-GI-NEXT: sqrdmlah v0.4h, v1.4h, v2.4h
; CHECK-GI-NEXT: ret
; CHECK-LABEL: test_vqrdmlah_laneq_s16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmlah v0.4h, v1.4h, v2.h[7]
; CHECK-NEXT: ret
entry:
%lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vqrdmlah_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlah.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4
@@ -586,16 +580,10 @@ entry:
}
define <2 x i32> @test_vqrdmlah_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-SD-LABEL: test_vqrdmlah_laneq_s32:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sqrdmlah v0.2s, v1.2s, v2.s[3]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_vqrdmlah_laneq_s32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
; CHECK-GI-NEXT: sqrdmlah v0.2s, v1.2s, v2.2s
; CHECK-GI-NEXT: ret
; CHECK-LABEL: test_vqrdmlah_laneq_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmlah v0.2s, v1.2s, v2.s[3]
; CHECK-NEXT: ret
entry:
%lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 3, i32 3>
%vqrdmlah_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlah.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4
@@ -700,22 +688,13 @@ entry:
}
define i16 @test_vqrdmlahh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) {
; CHECK-SD-LABEL: test_vqrdmlahh_laneq_s16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fmov s1, w0
; CHECK-SD-NEXT: fmov s2, w1
; CHECK-SD-NEXT: sqrdmlah v1.4h, v2.4h, v0.h[7]
; CHECK-SD-NEXT: umov w0, v1.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_vqrdmlahh_laneq_s16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: dup v0.8h, v0.h[7]
; CHECK-GI-NEXT: fmov s1, w0
; CHECK-GI-NEXT: fmov s2, w1
; CHECK-GI-NEXT: sqrdmlah v1.4h, v2.4h, v0.4h
; CHECK-GI-NEXT: umov w0, v1.h[0]
; CHECK-GI-NEXT: ret
; CHECK-LABEL: test_vqrdmlahh_laneq_s16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: fmov s2, w1
; CHECK-NEXT: sqrdmlah v1.4h, v2.4h, v0.h[7]
; CHECK-NEXT: umov w0, v1.h[0]
; CHECK-NEXT: ret
entry:
%0 = insertelement <4 x i16> undef, i16 %a, i64 0
%1 = insertelement <4 x i16> undef, i16 %b, i64 0
@@ -740,16 +719,10 @@ entry:
}
define <4 x i16> @test_vqrdmlsh_laneq_s16(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) {
; CHECK-SD-LABEL: test_vqrdmlsh_laneq_s16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sqrdmlsh v0.4h, v1.4h, v2.h[7]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_vqrdmlsh_laneq_s16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: dup v2.8h, v2.h[7]
; CHECK-GI-NEXT: sqrdmlsh v0.4h, v1.4h, v2.4h
; CHECK-GI-NEXT: ret
; CHECK-LABEL: test_vqrdmlsh_laneq_s16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmlsh v0.4h, v1.4h, v2.h[7]
; CHECK-NEXT: ret
entry:
%lane = shufflevector <8 x i16> %v, <8 x i16> poison, <4 x i32> <i32 7, i32 7, i32 7, i32 7>
%vqrdmlsh_v3.i = tail call <4 x i16> @llvm.aarch64.neon.sqrdmlsh.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %lane) #4
@@ -757,16 +730,10 @@ entry:
}
define <2 x i32> @test_vqrdmlsh_laneq_s32(<2 x i32> %a, <2 x i32> %b, <4 x i32> %v) {
; CHECK-SD-LABEL: test_vqrdmlsh_laneq_s32:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: sqrdmlsh v0.2s, v1.2s, v2.s[3]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_vqrdmlsh_laneq_s32:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: dup v2.4s, v2.s[3]
; CHECK-GI-NEXT: sqrdmlsh v0.2s, v1.2s, v2.2s
; CHECK-GI-NEXT: ret
; CHECK-LABEL: test_vqrdmlsh_laneq_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sqrdmlsh v0.2s, v1.2s, v2.s[3]
; CHECK-NEXT: ret
entry:
%lane = shufflevector <4 x i32> %v, <4 x i32> poison, <2 x i32> <i32 3, i32 3>
%vqrdmlsh_v3.i = tail call <2 x i32> @llvm.aarch64.neon.sqrdmlsh.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %lane) #4
@@ -871,22 +838,13 @@ entry:
}
define i16 @test_vqrdmlshh_laneq_s16(i16 %a, i16 %b, <8 x i16> %c) {
; CHECK-SD-LABEL: test_vqrdmlshh_laneq_s16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: fmov s1, w0
; CHECK-SD-NEXT: fmov s2, w1
; CHECK-SD-NEXT: sqrdmlsh v1.4h, v2.4h, v0.h[7]
; CHECK-SD-NEXT: umov w0, v1.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: test_vqrdmlshh_laneq_s16:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: dup v0.8h, v0.h[7]
; CHECK-GI-NEXT: fmov s1, w0
; CHECK-GI-NEXT: fmov s2, w1
; CHECK-GI-NEXT: sqrdmlsh v1.4h, v2.4h, v0.4h
; CHECK-GI-NEXT: umov w0, v1.h[0]
; CHECK-GI-NEXT: ret
; CHECK-LABEL: test_vqrdmlshh_laneq_s16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov s1, w0
; CHECK-NEXT: fmov s2, w1
; CHECK-NEXT: sqrdmlsh v1.4h, v2.4h, v0.h[7]
; CHECK-NEXT: umov w0, v1.h[0]
; CHECK-NEXT: ret
entry:
%0 = insertelement <4 x i16> undef, i16 %a, i64 0
%1 = insertelement <4 x i16> undef, i16 %b, i64 0