From e29d092af8b0ed2b15ce1dfd9fc4caef1976eef7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 6 Nov 2024 08:20:39 +0000
Subject: [PATCH] [X86] getFauxShuffleMask - add ISD::SHL/SRL handling

This is currently mostly the same as the VSHLI/VSRLI handling below,
although I've kept them separate as I'm investigating adding non-uniform
shift amount handling as a followup
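To make the decode direction concrete, here is a minimal standalone sketch
(illustration only, not LLVM code; the helper name buildShiftMask is made
up) that builds the same byte-shuffle masks as the hunk below. It assumes
LLVM's convention that mask indices count bytes from the least-significant
byte of element 0, and uses -1 in place of SM_SentinelZero:

  // Illustration only -- mirrors the SHL/SRL cases added to
  // getFauxShuffleMask, outside of LLVM. Builds a per-byte shuffle mask
  // for a logical shift of bitShift bits on each numBytesPerElt-byte
  // element of a numSizeInBytes-byte vector. -1 stands in for LLVM's
  // SM_SentinelZero ("this output byte is forced to zero").
  #include <cstdio>
  #include <vector>

  std::vector<int> buildShiftMask(bool isShl, unsigned bitShift,
                                  unsigned numSizeInBytes,
                                  unsigned numBytesPerElt) {
    // Only 'whole byte' shifts decode as shuffles, and a valid shift
    // amount is always smaller than the element width.
    if (bitShift % 8 != 0 || bitShift >= numBytesPerElt * 8)
      return {};
    unsigned byteShift = bitShift / 8;

    std::vector<int> mask(numSizeInBytes, -1); // all-zeros to start
    for (unsigned i = 0; i != numSizeInBytes; i += numBytesPerElt)
      for (unsigned j = byteShift; j != numBytesPerElt; ++j) {
        if (isShl)
          mask[i + j] = int(i + j - byteShift); // SHL: bytes move up
        else
          mask[i + j - byteShift] = int(i + j); // SRL: bytes move down
      }
    return mask;
  }

  int main() {
    // v2i64 (16 bytes, 8-byte elements) shifted by 16 bits == 2 bytes.
    for (bool isShl : {true, false}) {
      printf("%s:", isShl ? "SHL" : "SRL");
      for (int m : buildShiftMask(isShl, 16, 16, 8))
        printf(" %2d", m);
      printf("\n");
    }
    // SHL: -1 -1  0  1  2  3  4  5 -1 -1  8  9 10 11 12 13
    // SRL:  2  3  4  5  6  7 -1 -1 10 11 12 13 14 15 -1 -1
    return 0;
  }

In other words, each element's vacated low (SHL) or high (SRL) bytes become
zero-fill sentinels and the surviving bytes shift within their element.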
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 24 +++++++++
 llvm/test/CodeGen/X86/combine-pmuldq.ll | 66 ++++++++++++-------------
 2 files changed, 57 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9de202d2a89a..f0e10fe66868 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6270,6 +6270,30 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
     Ops.push_back(Src);
     return true;
   }
+  case ISD::SHL:
+  case ISD::SRL: {
+    // We can only decode 'whole byte' bit shifts as shuffles.
+    std::optional<uint64_t> Amt = DAG.getValidShiftAmount(N, DemandedElts);
+    if (!Amt || (*Amt % 8) != 0)
+      return false;
+
+    uint64_t ByteShift = *Amt / 8;
+    Ops.push_back(N.getOperand(0));
+
+    // Clear mask to all zeros and insert the shifted byte indices.
+    Mask.append(NumSizeInBytes, SM_SentinelZero);
+
+    if (ISD::SHL == Opcode) {
+      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
+        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
+          Mask[i + j] = i + j - ByteShift;
+    } else {
+      for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
+        for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
+          Mask[i + j - ByteShift] = i + j;
+    }
+    return true;
+  }
   case X86ISD::VSHLI:
   case X86ISD::VSRLI: {
     uint64_t ShiftVal = N.getConstantOperandVal(1);
diff --git a/llvm/test/CodeGen/X86/combine-pmuldq.ll b/llvm/test/CodeGen/X86/combine-pmuldq.ll
index 4efde4b8d753..0748ca626bcf 100644
--- a/llvm/test/CodeGen/X86/combine-pmuldq.ll
+++ b/llvm/test/CodeGen/X86/combine-pmuldq.ll
@@ -246,19 +246,19 @@ define i32 @PR43159(ptr %a0) {
 ; AVX2-LABEL: PR43159:
 ; AVX2:       # %bb.0: # %entry
 ; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX2-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX2-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX2-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vmovd %xmm0, %edi
 ; AVX2-NEXT:    vpextrd $1, %xmm0, %esi
@@ -269,19 +269,19 @@ define i32 @PR43159(ptr %a0) {
 ; AVX512VL-LABEL: PR43159:
 ; AVX512VL:       # %bb.0: # %entry
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512VL-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512VL-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512VL-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512VL-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; AVX512VL-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512VL-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
-; AVX512VL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512VL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX512VL-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX512VL-NEXT:    vmovd %xmm0, %edi
 ; AVX512VL-NEXT:    vpextrd $1, %xmm0, %esi
@@ -292,19 +292,19 @@ define i32 @PR43159(ptr %a0) {
 ; AVX512DQVL-LABEL: PR43159:
 ; AVX512DQVL:       # %bb.0: # %entry
 ; AVX512DQVL-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512DQVL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX512DQVL-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX512DQVL-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
-; AVX512DQVL-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
-; AVX512DQVL-NEXT:    vpsubd %xmm2, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
+; AVX512DQVL-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX512DQVL-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
+; AVX512DQVL-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512DQVL-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
-; AVX512DQVL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512DQVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3]
-; AVX512DQVL-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648]
+; AVX512DQVL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512DQVL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
+; AVX512DQVL-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX512DQVL-NEXT:    vmovd %xmm0, %edi
 ; AVX512DQVL-NEXT:    vpextrd $1, %xmm0, %esi
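As a sanity check on the SRL direction, the following standalone snippet
(hypothetical, not part of the patch; assumes a little-endian host, as on
x86) applies the byte gather the mask describes and compares it against a
real 16-bit logical right shift of each i64 lane:

  #include <cassert>
  #include <cstdint>
  #include <cstring>

  int main() {
    const unsigned NumSizeInBytes = 16, NumBytesPerElt = 8, ByteShift = 2;

    // Two arbitrary i64 lanes, reinterpreted as 16 bytes.
    uint64_t src[2] = {0x1122334455667788ULL, 0x99AABBCCDDEEFF00ULL};
    uint8_t in[NumSizeInBytes], out[NumSizeInBytes];
    std::memcpy(in, src, sizeof(in));

    // Build the SRL mask exactly as in the patch (-1 == zero fill) ...
    int mask[NumSizeInBytes];
    for (unsigned k = 0; k != NumSizeInBytes; ++k)
      mask[k] = -1;
    for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
      for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
        mask[i + j - ByteShift] = int(i + j);

    // ... apply it as a byte gather ...
    for (unsigned k = 0; k != NumSizeInBytes; ++k)
      out[k] = mask[k] < 0 ? 0 : in[mask[k]];

    // ... and check it matches a 16-bit logical right shift per lane.
    uint64_t res[2];
    std::memcpy(res, out, sizeof(res));
    assert(res[0] == (src[0] >> 16) && res[1] == (src[1] >> 16));
    return 0;
  }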