Across-lane intrinsics with integer destination type (uaddv, saddv, umaxv, smavx, uminv, sminv) were legalized with the destination type given in the LLVM IR intrinsic’s definition. It was wider than the actual destination type of the corresponding machine instruction. InstructionSelect was implicitly supposed to generate underlying extension instructions for these intrinsics, while the real destination type was opaque for other GlobalISel passes. Thus, llvm/test/CodeGen/AArch64/arm64-vaddv.ll failed on GlobalISel since the generated code was worse in functions that used the value of an across-lane intrinsic in following FP&SIMD instructions (functions with _used_by_laneop suffix). Here intrinsics are legalized and selected with an actual destination type, making it transparent to other passes. If the destination value is used in further instructions accepting FPR registers, there won’t be extra copies across register banks. i16 type is added to the list of the types of the FPR16 register bank to make it possible, and a few SelectionDAG patterns are modified to eliminate ambiguity in TableGen. Differential Revision: https://reviews.llvm.org/D156831
165 lines
5.2 KiB
LLVM
165 lines
5.2 KiB
LLVM
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
|
|
; RUN: llc < %s -global-isel=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s
|
|
|
|
define i32 @vmin_u8x8(<8 x i8> %a) nounwind ssp {
|
|
; CHECK-LABEL: vmin_u8x8:
|
|
; CHECK: uminv.8b b[[REG:[0-9]+]], v0
|
|
; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
|
|
; CHECK-NOT: and
|
|
; CHECK: cbz [[REG2]],
|
|
entry:
|
|
%vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a) nounwind
|
|
%tmp = trunc i32 %vminv.i to i8
|
|
%tobool = icmp eq i8 %tmp, 0
|
|
br i1 %tobool, label %return, label %if.then
|
|
|
|
if.then:
|
|
%call1 = tail call i32 @bar() nounwind
|
|
br label %return
|
|
|
|
return:
|
|
%retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
|
|
ret i32 %retval.0
|
|
}
|
|
|
|
declare i32 @bar(...)
|
|
|
|
define i32 @vmin_u4x16(<4 x i16> %a) nounwind ssp {
|
|
; CHECK-LABEL: vmin_u4x16:
|
|
; CHECK: uminv.4h h[[REG:[0-9]+]], v0
|
|
; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
|
|
; CHECK-NOT: and
|
|
; CHECK: cbz [[REG2]],
|
|
entry:
|
|
%vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a) nounwind
|
|
%tmp = trunc i32 %vminv.i to i16
|
|
%tobool = icmp eq i16 %tmp, 0
|
|
br i1 %tobool, label %return, label %if.then
|
|
|
|
if.then:
|
|
%call1 = tail call i32 @bar() nounwind
|
|
br label %return
|
|
|
|
return:
|
|
%retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
|
|
ret i32 %retval.0
|
|
}
|
|
|
|
define i32 @vmin_u8x16(<8 x i16> %a) nounwind ssp {
|
|
; CHECK-LABEL: vmin_u8x16:
|
|
; CHECK: uminv.8h h[[REG:[0-9]+]], v0
|
|
; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
|
|
; CHECK-NOT: and
|
|
; CHECK: cbz [[REG2]],
|
|
entry:
|
|
%vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a) nounwind
|
|
%tmp = trunc i32 %vminv.i to i16
|
|
%tobool = icmp eq i16 %tmp, 0
|
|
br i1 %tobool, label %return, label %if.then
|
|
|
|
if.then:
|
|
%call1 = tail call i32 @bar() nounwind
|
|
br label %return
|
|
|
|
return:
|
|
%retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
|
|
ret i32 %retval.0
|
|
}
|
|
|
|
define i32 @vmin_u16x8(<16 x i8> %a) nounwind ssp {
|
|
; CHECK-LABEL: vmin_u16x8:
|
|
; CHECK: uminv.16b b[[REG:[0-9]+]], v0
|
|
; CHECK: fmov [[REG2:w[0-9]+]], s[[REG]]
|
|
; CHECK-NOT: and
|
|
; CHECK: cbz [[REG2]],
|
|
entry:
|
|
%vminv.i = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a) nounwind
|
|
%tmp = trunc i32 %vminv.i to i8
|
|
%tobool = icmp eq i8 %tmp, 0
|
|
br i1 %tobool, label %return, label %if.then
|
|
|
|
if.then:
|
|
%call1 = tail call i32 @bar() nounwind
|
|
br label %return
|
|
|
|
return:
|
|
%retval.0 = phi i32 [ %call1, %if.then ], [ 0, %entry ]
|
|
ret i32 %retval.0
|
|
}
|
|
|
|
define <8 x i8> @test_vminv_u8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) {
|
|
; CHECK-LABEL: test_vminv_u8_used_by_laneop:
|
|
; CHECK: uminv.8b b[[REGNUM:[0-9]+]], v1
|
|
; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
|
|
; CHECK-NEXT: ret
|
|
entry:
|
|
%0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8> %a2)
|
|
%1 = trunc i32 %0 to i8
|
|
%2 = insertelement <8 x i8> %a1, i8 %1, i32 3
|
|
ret <8 x i8> %2
|
|
}
|
|
|
|
define <4 x i16> @test_vminv_u16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) {
|
|
; CHECK-LABEL: test_vminv_u16_used_by_laneop:
|
|
; CHECK: uminv.4h h[[REGNUM:[0-9]+]], v1
|
|
; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
|
|
; CHECK-NEXT: ret
|
|
entry:
|
|
%0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16> %a2)
|
|
%1 = trunc i32 %0 to i16
|
|
%2 = insertelement <4 x i16> %a1, i16 %1, i32 3
|
|
ret <4 x i16> %2
|
|
}
|
|
|
|
define <2 x i32> @test_vminv_u32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) {
|
|
; CHECK-LABEL: test_vminv_u32_used_by_laneop:
|
|
; CHECK: uminp.2s v[[REGNUM:[0-9]+]], v1, v1
|
|
; CHECK-NEXT: mov.s v0[1], v[[REGNUM]][0]
|
|
; CHECK-NEXT: ret
|
|
entry:
|
|
%0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32> %a2)
|
|
%1 = insertelement <2 x i32> %a1, i32 %0, i32 1
|
|
ret <2 x i32> %1
|
|
}
|
|
|
|
define <16 x i8> @test_vminvq_u8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) {
|
|
; CHECK-LABEL: test_vminvq_u8_used_by_laneop:
|
|
; CHECK: uminv.16b b[[REGNUM:[0-9]+]], v1
|
|
; CHECK-NEXT: mov.b v0[3], v[[REGNUM]][0]
|
|
; CHECK-NEXT: ret
|
|
entry:
|
|
%0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8> %a2)
|
|
%1 = trunc i32 %0 to i8
|
|
%2 = insertelement <16 x i8> %a1, i8 %1, i32 3
|
|
ret <16 x i8> %2
|
|
}
|
|
|
|
define <8 x i16> @test_vminvq_u16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) {
|
|
; CHECK-LABEL: test_vminvq_u16_used_by_laneop:
|
|
; CHECK: uminv.8h h[[REGNUM:[0-9]+]], v1
|
|
; CHECK-NEXT: mov.h v0[3], v[[REGNUM]][0]
|
|
; CHECK-NEXT: ret
|
|
entry:
|
|
%0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16> %a2)
|
|
%1 = trunc i32 %0 to i16
|
|
%2 = insertelement <8 x i16> %a1, i16 %1, i32 3
|
|
ret <8 x i16> %2
|
|
}
|
|
|
|
define <4 x i32> @test_vminvq_u32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) {
|
|
; CHECK-LABEL: test_vminvq_u32_used_by_laneop:
|
|
; CHECK: uminv.4s s[[REGNUM:[0-9]+]], v1
|
|
; CHECK-NEXT: mov.s v0[3], v[[REGNUM]][0]
|
|
; CHECK-NEXT: ret
|
|
entry:
|
|
%0 = tail call i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32> %a2)
|
|
%1 = insertelement <4 x i32> %a1, i32 %0, i32 3
|
|
ret <4 x i32> %1
|
|
}
|
|
declare i32 @llvm.aarch64.neon.uminv.i32.v16i8(<16 x i8>) nounwind readnone
|
|
declare i32 @llvm.aarch64.neon.uminv.i32.v8i16(<8 x i16>) nounwind readnone
|
|
declare i32 @llvm.aarch64.neon.uminv.i32.v4i16(<4 x i16>) nounwind readnone
|
|
declare i32 @llvm.aarch64.neon.uminv.i32.v8i8(<8 x i8>) nounwind readnone
|
|
declare i32 @llvm.aarch64.neon.uminv.i32.v2i32(<2 x i32>) nounwind readnone
|
|
declare i32 @llvm.aarch64.neon.uminv.i32.v4i32(<4 x i32>) nounwind readnone
|