[AMDGPU] Improve the lowering of raw_buffer_load_{i8,i16} and struct_buffer_load_{i8,i16} intrinsics
Currently, the raw_buffer_load_{i8,i16} and struct_buffer_load_{i8,i16}
intrinsics are lowered to buffer_load_{u8,u16}. This patch combines
buffer_load_{u8,u16} with the subsequent sign-extension instruction to
generate buffer_load_{i8,i16} instructions.
Reviewed By: foad
Differential Revision: https://reviews.llvm.org/D144313
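
For context, the pattern this combine targets looks roughly like the IR below. This is a minimal sketch modeled on the tests updated later in this diff; the function name and exact signature are illustrative, not taken from the patch. Before this change, the sext selected to buffer_load_ubyte followed by v_bfe_i32; with the combine it selects to a single buffer_load_sbyte.

; Minimal sketch (hypothetical test function): the sext of the i8 result is
; what the new post-legalizer combine folds into the load itself.
define amdgpu_ps float @load_i8_sext_sketch(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  %ext = sext i8 %val to i32
  %cast = bitcast i32 %ext to float
  ret float %cast
}

declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32)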
@@ -104,6 +104,14 @@ def foldable_fneg : GICombineRule<
          [{ return Helper.matchFoldableFneg(*${ffn}, ${matchinfo}); }]),
   (apply [{ Helper.applyFoldableFneg(*${ffn}, ${matchinfo}); }])>;

+def sign_extension_in_reg_matchdata : GIDefMatchData<"MachineInstr *">;
+
+def sign_extension_in_reg : GICombineRule<
+  (defs root:$sign_inreg, sign_extension_in_reg_matchdata:$matchinfo),
+  (match (wip_match_opcode G_SEXT_INREG):$sign_inreg,
+         [{ return PostLegalizerHelper.matchCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }]),
+  (apply [{ PostLegalizerHelper.applyCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }])>;
+
 // Combines which should only apply on SI/VI
 def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;

@@ -119,7 +127,7 @@ def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
   "AMDGPUGenPostLegalizerCombinerHelper",
   [all_combines, gfx6gfx7_combines,
    uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
-   rcp_sqrt_to_rsq]> {
+   rcp_sqrt_to_rsq, sign_extension_in_reg]> {
   let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
   let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
   let AdditionalArguments = [];

@@ -36,12 +36,16 @@ protected:
   MachineIRBuilder &B;
   MachineFunction &MF;
   MachineRegisterInfo &MRI;
+  const GCNSubtarget &Subtarget;
+  const SIInstrInfo &TII;
   AMDGPUCombinerHelper &Helper;

 public:
   AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B,
                                     AMDGPUCombinerHelper &Helper)
-      : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
+      : B(B), MF(B.getMF()), MRI(*B.getMRI()),
+        Subtarget(MF.getSubtarget<GCNSubtarget>()),
+        TII(*Subtarget.getInstrInfo()), Helper(Helper){};

   struct FMinFMaxLegacyInfo {
     Register LHS;
@@ -74,6 +78,11 @@ public:
                          const CvtF32UByteMatchInfo &MatchInfo);

   bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
+
+  // Combine an unsigned buffer load and a sign extension instruction to
+  // generate a signed buffer load instruction.
+  bool matchCombineSignExtendInReg(MachineInstr &MI, MachineInstr *&MatchInfo);
+  void applyCombineSignExtendInReg(MachineInstr &MI, MachineInstr *&MatchInfo);
 };

 bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
@@ -302,6 +311,45 @@ bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
   return TLI->isCanonicalized(Reg, MF);
 }

+// The buffer_load_{i8, i16} intrinsics are initially lowered as
+// buffer_load_{u8, u16} instructions. Here, the buffer_load_{u8, u16}
+// instructions are combined with sign extension instructions to generate
+// buffer_load_{i8, i16} instructions.
+
+// Identify buffer_load_{u8, u16}.
+bool AMDGPUPostLegalizerCombinerHelper::matchCombineSignExtendInReg(
+    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) {
+  Register Op0Reg = MI.getOperand(1).getReg();
+  SubwordBufferLoad = MRI.getVRegDef(Op0Reg);
+
+  if (!MRI.hasOneNonDBGUse(Op0Reg))
+    return false;
+
+  // Check if the first operand of the sign extension is a subword buffer load
+  // instruction.
+  return SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE ||
+         SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
+}
+
+// Combine buffer_load_{u8, u16} and the sign extension instruction to generate
+// buffer_load_{i8, i16}.
+void AMDGPUPostLegalizerCombinerHelper::applyCombineSignExtendInReg(
+    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) {
+  // Modify the opcode and the destination of buffer_load_{u8, u16}:
+  // Replace the opcode.
+  unsigned Opc =
+      SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE
+          ? AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE
+          : AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
+  SubwordBufferLoad->setDesc(TII.get(Opc));
+  // Update the destination register of SubwordBufferLoad with the destination
+  // register of the sign extension.
+  Register SignExtendInsnDst = MI.getOperand(0).getReg();
+  SubwordBufferLoad->getOperand(0).setReg(SignExtendInsnDst);
+  // Remove the sign extension.
+  MI.eraseFromParent();
+}
+
 class AMDGPUPostLegalizerCombinerHelperState {
 protected:
   AMDGPUCombinerHelper &Helper;

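Note the hasOneNonDBGUse check above: the rewrite only fires when the sign extension is the sole non-debug user of the load result. As a hedged illustration (a hypothetical function, not one of the tests in this patch), IR like the following is expected to keep the unsigned buffer_load, because the loaded value has a second use besides the sext:

; Hypothetical example: %val feeds both a sext and a zext, so the load result
; has two non-debug uses and the combine is expected not to apply.
define amdgpu_ps float @load_i8_two_uses(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
  %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
  %sext = sext i8 %val to i32
  %zext = zext i8 %val to i32
  %sum = add i32 %sext, %zext
  %cast = bitcast i32 %sum to float
  ret float %cast
}

declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32)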
@@ -442,9 +442,8 @@ define amdgpu_ps float @raw_buffer_load_i8__sgpr_rsrc__vgpr_voffset__sgpr_soffse
   ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK-NEXT: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 7)
-  ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_OFFEN]], 0, 8, implicit $exec
-  ; CHECK-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+  ; CHECK-NEXT: [[BUFFER_LOAD_SBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 7)
+  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_OFFEN]]
   ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   %zext = sext i8 %val to i32
@@ -485,9 +484,8 @@ define amdgpu_ps float @raw_buffer_load_i16__sgpr_rsrc__vgpr_voffset__sgpr_soffs
   ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
-  ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 7)
-  ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_OFFEN]], 0, 16, implicit $exec
-  ; CHECK-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+  ; CHECK-NEXT: [[BUFFER_LOAD_SSHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_OFFEN [[COPY4]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 7)
+  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_OFFEN]]
   ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call i16 @llvm.amdgcn.raw.buffer.load.i16(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   %sext = sext i16 %val to i32

@@ -267,9 +267,8 @@ define amdgpu_ps float @struct_buffer_load_i8_sext__sgpr_rsrc__vgpr_vindex__vgpr
   ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK-NEXT: [[BUFFER_LOAD_UBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 7)
-  ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_UBYTE_BOTHEN]], 0, 8, implicit $exec
-  ; CHECK-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+  ; CHECK-NEXT: [[BUFFER_LOAD_SBYTE_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SBYTE_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s8), addrspace 7)
+  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SBYTE_BOTHEN]]
   ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   %ext = sext i8 %val to i32
@@ -314,9 +313,8 @@ define amdgpu_ps float @struct_buffer_load_i16_sext__sgpr_rsrc__vgpr_vindex__vgp
   ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK-NEXT: [[BUFFER_LOAD_USHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 7)
-  ; CHECK-NEXT: [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[BUFFER_LOAD_USHORT_BOTHEN]], 0, 16, implicit $exec
-  ; CHECK-NEXT: $vgpr0 = COPY [[V_BFE_I32_e64_]]
+  ; CHECK-NEXT: [[BUFFER_LOAD_SSHORT_BOTHEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_SSHORT_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, implicit $exec :: (dereferenceable load (s16), align 1, addrspace 7)
+  ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_SSHORT_BOTHEN]]
   ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
   %val = call i16 @llvm.amdgcn.struct.buffer.load.i16(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
   %ext = sext i16 %val to i32