clang-p2996/llvm/test/CodeGen/AArch64/aarch64-pmull2.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mtriple=aarch64-linux-gnu -mattr=+aes -o - %s | FileCheck %s --check-prefixes=CHECK
; User code intends to execute {pmull, pmull2} instructions directly on the {lower, higher} halves of the same vector registers.
; Test that PMULL2 is generated for the higher-half operands.
; Suboptimal code generation would fail to use the higher-half contents in place; instead, it would move the higher-lane
; contents down to the lower lane so that PMULL can be used everywhere, generating unnecessary moves.
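; A rough C-level sketch of that user intent for one of the two vectors, assuming ACLE
; intrinsics from <arm_neon.h> (an illustration only, not the original source of this IR):
;   poly64x2_t v  = vld1q_p64(p);                                  // both 64-bit lanes
;   poly128_t  hi = vmull_high_p64(v, vdupq_n_p64(0x9e4addf8));    // PMULL2 on lane 1
;   poly128_t  lo = vmull_p64(vgetq_lane_p64(v, 0), 0x740eef02);   // PMULL on lane 0
;   vst1q_p64(p, vreinterpretq_p64_p128(hi ^ lo));                 // fold the two products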
define void @test1(ptr %0, ptr %1) {
; CHECK-LABEL: test1:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #56824 // =0xddf8
; CHECK-NEXT: mov w9, #61186 // =0xef02
; CHECK-NEXT: movk w8, #40522, lsl #16
; CHECK-NEXT: movk w9, #29710, lsl #16
; CHECK-NEXT: ldp q0, q1, [x1]
; CHECK-NEXT: dup v2.2d, x8
; CHECK-NEXT: fmov d3, x9
; CHECK-NEXT: pmull v4.1q, v0.1d, v3.1d
; CHECK-NEXT: pmull v3.1q, v1.1d, v3.1d
; CHECK-NEXT: pmull2 v0.1q, v0.2d, v2.2d
; CHECK-NEXT: pmull2 v1.1q, v1.2d, v2.2d
; CHECK-NEXT: eor v0.16b, v4.16b, v0.16b
; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b
; CHECK-NEXT: stp q0, q1, [x1]
; CHECK-NEXT: ret
%3 = load <2 x i64>, ptr %1
%4 = getelementptr inbounds <2 x i64>, ptr %1, i64 1
%5 = load <2 x i64>, ptr %4
%6 = extractelement <2 x i64> %3, i64 1
%7 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %6, i64 2655706616)
%8 = extractelement <2 x i64> %5, i64 1
%9 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %8, i64 2655706616)
%10 = load <2 x i64>, ptr %0
%11 = getelementptr inbounds i8, ptr %0, i64 16
%12 = load <2 x i64>, ptr %11
%13 = extractelement <2 x i64> %3, i64 0
%14 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %13, i64 1947135746)
%15 = extractelement <2 x i64> %5, i64 0
%16 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %15, i64 1947135746)
%17 = xor <16 x i8> %14, %7
%18 = xor <16 x i8> %16, %9
store <16 x i8> %17, ptr %1
store <16 x i8> %18, ptr %4
ret void
}
; One operand is the higher half of a SIMD register, and the other operand is the lower half of another SIMD register.
; Test that codegen doesn't generate unnecessary moves.
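; A rough C-level sketch of that pattern, assuming ACLE intrinsics (illustration only):
;   poly128_t r = vmull_p64(vgetq_lane_p64(a, 1),   // higher half of one register
;                           vgetq_lane_p64(b, 0));  // lower half of another register
;   vst1q_p64(out, vreinterpretq_p64_p128(r));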
define void @test2(ptr %0, <2 x i64> %1, <2 x i64> %2) {
; CHECK-LABEL: test2:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v1.2d, v1.d[0]
; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
%4 = extractelement <2 x i64> %1, i64 1
%5 = extractelement <2 x i64> %2, i64 0
%6 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %4, i64 %5)
store <16 x i8> %6, ptr %0, align 16
ret void
}
; Operand %4 is the higher half of a v2i64, and operand %2 is an i64 input parameter.
; Test that %2 is duplicated directly into the proper lane of a SIMD register for optimal codegen.
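; A rough C-level sketch of that pattern, assuming ACLE intrinsics (illustration only;
; x stands for the plain 64-bit scalar parameter):
;   poly128_t r = vmull_p64(vgetq_lane_p64(a, 1), (poly64_t)x);
;   vst1q_p64(out, vreinterpretq_p64_p128(r));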
define void @test3(ptr %0, <2 x i64> %1, i64 %2) {
; CHECK-LABEL: test3:
; CHECK: // %bb.0:
; CHECK-NEXT: dup v1.2d, x1
; CHECK-NEXT: pmull2 v0.1q, v0.2d, v1.2d
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret
%4 = extractelement <2 x i64> %1, i64 1
%5 = tail call <16 x i8> @llvm.aarch64.neon.pmull64(i64 %4, i64 %2)
store <16 x i8> %5, ptr %0, align 16
ret void
}
declare <16 x i8> @llvm.aarch64.neon.pmull64(i64, i64)