[NVPTX] Vectorize and lower 256-bit global loads/stores for sm_100+/ptx88+ (#139292)
PTX 8.8+ introduces 256-bit-wide vector loads/stores, subject to target and address-space restrictions (sm_100+ and the global address space). This change extends the backend to lower these loads/stores. It also overrides getLoadStoreVecRegBitWidth for NVPTX, allowing the LoadStoreVectorizer to create these wider vector operations. See the spec for the three relevant PTX instructions here:
- https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld
- https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-ld-global-nc
- https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-st
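For illustration, a minimal sketch distilled from the tests added below (not part of the patch itself; the function name is hypothetical): on sm_100 with PTX 8.8, a 256-bit load/store pair in the global address space should now lower to single v8.b32 instructions, where older targets split it into two 128-bit v4 operations.

; Sketch: a 256-bit copy through the global address space. With
; -mcpu=sm_100 -mattr=+ptx88 this is expected to lower to one
; ld.global.v8.b32 and one st.global.v8.b32 (see the tests below);
; on sm_90 and earlier it is split into two v4.b32 operations.
define void @copy_v8i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
  %v = load <8 x i32>, ptr addrspace(1) %in, align 32
  store <8 x i32> %v, ptr addrspace(1) %out, align 32
  ret void
}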
@@ -311,17 +311,6 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
default:
llvm_unreachable("Unknown register type");
}
} else if (Modifier == "vec") {
switch (Imm) {
case NVPTX::PTXLdStInstCode::V2:
O << ".v2";
return;
case NVPTX::PTXLdStInstCode::V4:
O << ".v4";
return;
}
// TODO: evaluate whether cases not covered by this switch are bugs
return;
}
llvm_unreachable(formatv("Unknown Modifier: {}", Modifier).str().c_str());
}

@@ -190,17 +190,7 @@ enum AddressSpace : AddressSpaceUnderlyingType {
};

namespace PTXLdStInstCode {
enum FromType {
Unsigned = 0,
Signed,
Float,
Untyped
};
enum VecType {
Scalar = 1,
V2 = 2,
V4 = 4
};
enum FromType { Unsigned = 0, Signed, Float, Untyped };
} // namespace PTXLdStInstCode

/// PTXCvtMode - Conversion code enumeration

@@ -105,7 +105,7 @@ static bool eliminateMove(MachineInstr &Mov, const MachineRegisterInfo &MRI,
const MachineOperand *ParamSymbol = Mov.uses().begin();
assert(ParamSymbol->isSymbol());

constexpr unsigned LDInstBasePtrOpIdx = 6;
constexpr unsigned LDInstBasePtrOpIdx = 5;
constexpr unsigned LDInstAddrSpaceOpIdx = 2;
for (auto *LI : LoadInsts) {
(LI->uses().begin() + LDInstBasePtrOpIdx)

@@ -129,6 +129,7 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
return;
case NVPTXISD::LoadV2:
case NVPTXISD::LoadV4:
case NVPTXISD::LoadV8:
if (tryLoadVector(N))
return;
break;
@@ -139,6 +140,7 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
break;
case NVPTXISD::StoreV2:
case NVPTXISD::StoreV4:
case NVPTXISD::StoreV8:
if (tryStoreVector(N))
return;
break;
@@ -1012,11 +1014,11 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {

// Helper function template to reduce amount of boilerplate code for
// opcode selection.
static std::optional<unsigned>
pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
unsigned Opcode_i16, unsigned Opcode_i32,
std::optional<unsigned> Opcode_i64, unsigned Opcode_f32,
std::optional<unsigned> Opcode_f64) {
static std::optional<unsigned> pickOpcodeForVT(
MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i8,
std::optional<unsigned> Opcode_i16, std::optional<unsigned> Opcode_i32,
std::optional<unsigned> Opcode_i64, std::optional<unsigned> Opcode_f32,
std::optional<unsigned> Opcode_f64) {
switch (VT) {
case MVT::i1:
case MVT::i8:
@@ -1091,7 +1093,6 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
SDValue Ops[] = {getI32Imm(Ordering, DL),
getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(NVPTX::PTXLdStInstCode::Scalar, DL),
getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL),
Base,
@@ -1128,6 +1129,22 @@ static bool isSubVectorPackedInI32(EVT EltVT) {
return Isv2x16VT(EltVT) || EltVT == MVT::v4i8;
}

static unsigned getLoadStoreVectorNumElts(SDNode *N) {
switch (N->getOpcode()) {
case NVPTXISD::LoadV2:
case NVPTXISD::StoreV2:
return 2;
case NVPTXISD::LoadV4:
case NVPTXISD::StoreV4:
return 4;
case NVPTXISD::LoadV8:
case NVPTXISD::StoreV8:
return 8;
default:
llvm_unreachable("Unexpected opcode");
}
}

bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
MemSDNode *MemSD = cast<MemSDNode>(N);
const EVT MemEVT = MemSD->getMemoryVT();
@@ -1159,20 +1176,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
? NVPTX::PTXLdStInstCode::Signed
: NVPTX::PTXLdStInstCode::Untyped;

unsigned VecType;
unsigned FromTypeWidth;
switch (N->getOpcode()) {
case NVPTXISD::LoadV2:
FromTypeWidth = TotalWidth / 2;
VecType = NVPTX::PTXLdStInstCode::V2;
break;
case NVPTXISD::LoadV4:
FromTypeWidth = TotalWidth / 4;
VecType = NVPTX::PTXLdStInstCode::V4;
break;
default:
return false;
}
unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N);

if (isSubVectorPackedInI32(EltVT)) {
assert(ExtensionType == ISD::NON_EXTLOAD);
@@ -1180,14 +1184,13 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
}

assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
FromTypeWidth <= 128 && TotalWidth <= 128 && "Invalid width for load");
FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for load");

SDValue Offset, Base;
SelectADDR(N->getOperand(1), Base, Offset);
SDValue Ops[] = {getI32Imm(Ordering, DL),
getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(VecType, DL),
getI32Imm(FromType, DL),
getI32Imm(FromTypeWidth, DL),
Base,
@@ -1205,9 +1208,16 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
NVPTX::LDV_f32_v2, NVPTX::LDV_f64_v2);
break;
case NVPTXISD::LoadV4:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4,
NVPTX::LDV_i16_v4, NVPTX::LDV_i32_v4, std::nullopt,
NVPTX::LDV_f32_v4, std::nullopt);
Opcode =
pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4,
NVPTX::LDV_i16_v4, NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4,
NVPTX::LDV_f32_v4, NVPTX::LDV_f64_v4);
break;
case NVPTXISD::LoadV8:
Opcode =
pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
{/* no v8i16 */}, NVPTX::LDV_i32_v8, {/* no v8i64 */},
NVPTX::LDV_f32_v8, {/* no v8f64 */});
break;
}
if (!Opcode)
@@ -1303,13 +1313,20 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
Opcode = pickOpcodeForVT(
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE,
NVPTX::INT_PTX_LDG_G_v4i16_ELE, NVPTX::INT_PTX_LDG_G_v4i32_ELE,
std::nullopt, NVPTX::INT_PTX_LDG_G_v4f32_ELE, std::nullopt);
NVPTX::INT_PTX_LDG_G_v4i64_ELE, NVPTX::INT_PTX_LDG_G_v4f32_ELE,
NVPTX::INT_PTX_LDG_G_v4f64_ELE);
break;
case NVPTXISD::LDUV4:
Opcode = pickOpcodeForVT(
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE,
NVPTX::INT_PTX_LDU_G_v4i16_ELE, NVPTX::INT_PTX_LDU_G_v4i32_ELE,
std::nullopt, NVPTX::INT_PTX_LDU_G_v4f32_ELE, std::nullopt);
{/* no v4i64 */}, NVPTX::INT_PTX_LDU_G_v4f32_ELE, {/* no v4f64 */});
break;
case NVPTXISD::LoadV8:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
{/* no v8i16 */}, NVPTX::INT_PTX_LDG_G_v8i32_ELE,
{/* no v8i64 */}, NVPTX::INT_PTX_LDG_G_v8f32_ELE,
{/* no v8f64 */});
break;
}
if (!Opcode)
@@ -1395,7 +1412,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
getI32Imm(Ordering, DL),
getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(NVPTX::PTXLdStInstCode::Scalar, DL),
getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL),
getI32Imm(ToTypeWidth, DL),
Base,
@@ -1443,41 +1459,24 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
// - for integer type, always use 'u'
const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits();

SmallVector<SDValue, 12> Ops;
SDValue N2;
unsigned VecType;
unsigned ToTypeWidth;
unsigned NumElts = getLoadStoreVectorNumElts(N);

switch (N->getOpcode()) {
case NVPTXISD::StoreV2:
VecType = NVPTX::PTXLdStInstCode::V2;
Ops.append({N->getOperand(1), N->getOperand(2)});
N2 = N->getOperand(3);
ToTypeWidth = TotalWidth / 2;
break;
case NVPTXISD::StoreV4:
VecType = NVPTX::PTXLdStInstCode::V4;
Ops.append({N->getOperand(1), N->getOperand(2), N->getOperand(3),
N->getOperand(4)});
N2 = N->getOperand(5);
ToTypeWidth = TotalWidth / 4;
break;
default:
return false;
}
SmallVector<SDValue, 16> Ops(N->ops().slice(1, NumElts));
SDValue N2 = N->getOperand(NumElts + 1);
unsigned ToTypeWidth = TotalWidth / NumElts;

if (isSubVectorPackedInI32(EltVT)) {
EltVT = MVT::i32;
}

assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
TotalWidth <= 128 && "Invalid width for store");
TotalWidth <= 256 && "Invalid width for store");

SDValue Offset, Base;
SelectADDR(N2, Base, Offset);

Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
getI32Imm(CodeAddrSpace, DL),
getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL),
getI32Imm(ToTypeWidth, DL), Base, Offset, Chain});

@@ -1492,9 +1491,16 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
NVPTX::STV_f32_v2, NVPTX::STV_f64_v2);
break;
case NVPTXISD::StoreV4:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4,
NVPTX::STV_i16_v4, NVPTX::STV_i32_v4, std::nullopt,
NVPTX::STV_f32_v4, std::nullopt);
Opcode =
pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4,
NVPTX::STV_i16_v4, NVPTX::STV_i32_v4, NVPTX::STV_i64_v4,
NVPTX::STV_f32_v4, NVPTX::STV_f64_v4);
break;
case NVPTXISD::StoreV8:
Opcode =
pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */},
{/* no v8i16 */}, NVPTX::STV_i32_v8, {/* no v8i64 */},
NVPTX::STV_f32_v8, {/* no v8f64 */});
break;
}

@@ -1554,10 +1560,10 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
NVPTX::LoadParamMemV2F64);
break;
case 4:
Opcode =
pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32,
std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt);
Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
NVPTX::LoadParamMemV4I8, NVPTX::LoadParamMemV4I16,
NVPTX::LoadParamMemV4I32, {/* no v4i64 */},
NVPTX::LoadParamMemV4F32, {/* no v4f64 */});
break;
}
if (!Opcode)
@@ -1648,8 +1654,8 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
case 4:
Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
NVPTX::StoreRetvalV4I32, std::nullopt,
NVPTX::StoreRetvalV4F32, std::nullopt);
NVPTX::StoreRetvalV4I32, {/* no v4i64 */},
NVPTX::StoreRetvalV4F32, {/* no v4f64 */});
break;
}
if (!Opcode)

@@ -162,6 +162,14 @@ static bool IsPTXVectorType(MVT VT) {
case MVT::v2f32:
case MVT::v4f32:
case MVT::v2f64:
case MVT::v4i64:
case MVT::v4f64:
case MVT::v8i32:
case MVT::v8f32:
case MVT::v16f16: // <8 x f16x2>
case MVT::v16bf16: // <8 x bf16x2>
case MVT::v16i16: // <8 x i16x2>
case MVT::v32i8: // <8 x i8x4>
return true;
}
}
@@ -179,7 +187,7 @@ static bool Is16bitsType(MVT VT) {
// - unsigned int NumElts - The number of elements in the final vector
// - EVT EltVT - The type of the elements in the final vector
static std::optional<std::pair<unsigned int, MVT>>
getVectorLoweringShape(EVT VectorEVT) {
getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) {
if (!VectorEVT.isSimple())
return std::nullopt;
const MVT VectorVT = VectorEVT.getSimpleVT();
@@ -199,6 +207,15 @@ getVectorLoweringShape(EVT VectorEVT) {
switch (VectorVT.SimpleTy) {
default:
return std::nullopt;
case MVT::v4i64:
case MVT::v4f64:
case MVT::v8i32:
case MVT::v8f32:
// This is a "native" vector type iff the address space is global
// and the target supports 256-bit loads/stores
if (!CanLowerTo256Bit)
return std::nullopt;
LLVM_FALLTHROUGH;
case MVT::v2i8:
case MVT::v2i16:
case MVT::v2i32:
@@ -215,6 +232,15 @@ getVectorLoweringShape(EVT VectorEVT) {
case MVT::v4f32:
// This is a "native" vector type
return std::pair(NumElts, EltVT);
case MVT::v16f16: // <8 x f16x2>
case MVT::v16bf16: // <8 x bf16x2>
case MVT::v16i16: // <8 x i16x2>
case MVT::v32i8: // <8 x i8x4>
// This can be upsized into a "native" vector type iff the address space is
// global and the target supports 256-bit loads/stores.
if (!CanLowerTo256Bit)
return std::nullopt;
LLVM_FALLTHROUGH;
case MVT::v8i8: // <2 x i8x4>
case MVT::v8f16: // <4 x f16x2>
case MVT::v8bf16: // <4 x bf16x2>
@@ -1070,10 +1096,12 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(NVPTXISD::ProxyReg)
MAKE_CASE(NVPTXISD::LoadV2)
MAKE_CASE(NVPTXISD::LoadV4)
MAKE_CASE(NVPTXISD::LoadV8)
MAKE_CASE(NVPTXISD::LDUV2)
MAKE_CASE(NVPTXISD::LDUV4)
MAKE_CASE(NVPTXISD::StoreV2)
MAKE_CASE(NVPTXISD::StoreV4)
MAKE_CASE(NVPTXISD::StoreV8)
MAKE_CASE(NVPTXISD::FSHL_CLAMP)
MAKE_CASE(NVPTXISD::FSHR_CLAMP)
MAKE_CASE(NVPTXISD::BFE)
@@ -3211,7 +3239,8 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
if (ValVT != MemVT)
return SDValue();

const auto NumEltsAndEltVT = getVectorLoweringShape(ValVT);
const auto NumEltsAndEltVT = getVectorLoweringShape(
ValVT, STI.has256BitVectorLoadStore(N->getAddressSpace()));
if (!NumEltsAndEltVT)
return SDValue();
const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
@@ -3239,6 +3268,9 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
case 4:
Opcode = NVPTXISD::StoreV4;
break;
case 8:
Opcode = NVPTXISD::StoreV8;
break;
}

SmallVector<SDValue, 8> Ops;
@@ -5775,7 +5807,8 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG,

/// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads.
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &Results) {
SmallVectorImpl<SDValue> &Results,
const NVPTXSubtarget &STI) {
LoadSDNode *LD = cast<LoadSDNode>(N);
const EVT ResVT = LD->getValueType(0);
const EVT MemVT = LD->getMemoryVT();
@@ -5785,7 +5818,8 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
if (ResVT != MemVT)
return;

const auto NumEltsAndEltVT = getVectorLoweringShape(ResVT);
const auto NumEltsAndEltVT = getVectorLoweringShape(
ResVT, STI.has256BitVectorLoadStore(LD->getAddressSpace()));
if (!NumEltsAndEltVT)
return;
const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
@@ -5808,21 +5842,23 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;

unsigned Opcode;
SDVTList LdResVTs;
switch (NumElts) {
default:
return;
case 2:
Opcode = NVPTXISD::LoadV2;
LdResVTs = DAG.getVTList(LoadEltVT, LoadEltVT, MVT::Other);
break;
case 4: {
case 4:
Opcode = NVPTXISD::LoadV4;
LdResVTs =
DAG.getVTList({LoadEltVT, LoadEltVT, LoadEltVT, LoadEltVT, MVT::Other});
break;
case 8:
Opcode = NVPTXISD::LoadV8;
break;
}
}
auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
ListVTs.push_back(MVT::Other);
SDVTList LdResVTs = DAG.getVTList(ListVTs);

SDLoc DL(LD);

// Copy regular operands
@@ -6094,7 +6130,7 @@ void NVPTXTargetLowering::ReplaceNodeResults(
ReplaceBITCAST(N, DAG, Results);
return;
case ISD::LOAD:
ReplaceLoadVector(N, DAG, Results);
ReplaceLoadVector(N, DAG, Results, STI);
return;
case ISD::INTRINSIC_W_CHAIN:
ReplaceINTRINSIC_W_CHAIN(N, DAG, Results);

@@ -84,10 +84,12 @@ enum NodeType : unsigned {
FIRST_MEMORY_OPCODE,
LoadV2 = FIRST_MEMORY_OPCODE,
LoadV4,
LoadV8,
LDUV2, // LDU.v2
LDUV4, // LDU.v4
StoreV2,
StoreV4,
StoreV8,
LoadParam,
LoadParamV2,
LoadParamV4,

@@ -2391,9 +2391,9 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in {
class LD<NVPTXRegClass regclass>
: NVPTXInst<
(outs regclass:$dst),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign,
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign,
i32imm:$fromWidth, ADDR:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"ld${sem:sem}${scope:scope}${addsp:addsp}.${Sign:sign}$fromWidth "
"\t$dst, [$addr];", []>;

let mayLoad=1, hasSideEffects=0 in {
@@ -2409,8 +2409,8 @@ class ST<NVPTXRegClass regclass>
: NVPTXInst<
(outs),
(ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, ADDR:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth"
LdStCode:$Sign, i32imm:$toWidth, ADDR:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}.${Sign:sign}$toWidth"
" \t[$addr], $src;", []>;

let mayStore=1, hasSideEffects=0 in {
@@ -2425,52 +2425,71 @@ let mayStore=1, hasSideEffects=0 in {
// The following is used only in and after vector elementizations. Vector
// elementization happens at the machine instruction level, so the following
// instructions never appear in the DAG.
multiclass LD_VEC<NVPTXRegClass regclass> {
multiclass LD_VEC<NVPTXRegClass regclass, bit support_v8 = false> {
def _v2 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"ld${sem:sem}${scope:scope}${addsp:addsp}.v2.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2}}, [$addr];", []>;
def _v4 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"ld${sem:sem}${scope:scope}${addsp:addsp}.v4.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>;
if support_v8 then
def _v8 : NVPTXInst<
(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4,
regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8),
(ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign,
i32imm:$fromWidth, ADDR:$addr),
"ld${sem:sem}${scope:scope}${addsp:addsp}.v8.${Sign:sign}$fromWidth "
"\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, "
"[$addr];", []>;
}
let mayLoad=1, hasSideEffects=0 in {
defm LDV_i8 : LD_VEC<Int16Regs>;
defm LDV_i16 : LD_VEC<Int16Regs>;
defm LDV_i32 : LD_VEC<Int32Regs>;
defm LDV_i32 : LD_VEC<Int32Regs, support_v8 = true>;
defm LDV_i64 : LD_VEC<Int64Regs>;
defm LDV_f32 : LD_VEC<Float32Regs>;
defm LDV_f32 : LD_VEC<Float32Regs, support_v8 = true>;
defm LDV_f64 : LD_VEC<Float64Regs>;
}

multiclass ST_VEC<NVPTXRegClass regclass> {
multiclass ST_VEC<NVPTXRegClass regclass, bit support_v8 = false> {
def _v2 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope,
LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth,
LdStCode:$addsp, LdStCode:$Sign, i32imm:$fromWidth,
ADDR:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"st${sem:sem}${scope:scope}${addsp:addsp}.v2.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2}};", []>;
def _v4 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec,
LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp,
LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth "
"st${sem:sem}${scope:scope}${addsp:addsp}.v4.${Sign:sign}$fromWidth "
"\t[$addr], {{$src1, $src2, $src3, $src4}};", []>;
if support_v8 then
def _v8 : NVPTXInst<
(outs),
(ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4,
regclass:$src5, regclass:$src6, regclass:$src7, regclass:$src8,
LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign,
i32imm:$fromWidth, ADDR:$addr),
"st${sem:sem}${scope:scope}${addsp:addsp}.v8.${Sign:sign}$fromWidth "
"\t[$addr], "
"{{$src1, $src2, $src3, $src4, $src5, $src6, $src7, $src8}};", []>;
}

let mayStore=1, hasSideEffects=0 in {
defm STV_i8 : ST_VEC<Int16Regs>;
defm STV_i16 : ST_VEC<Int16Regs>;
defm STV_i32 : ST_VEC<Int32Regs>;
defm STV_i32 : ST_VEC<Int32Regs, support_v8 = true>;
defm STV_i64 : ST_VEC<Int64Regs>;
defm STV_f32 : ST_VEC<Float32Regs>;
defm STV_f32 : ST_VEC<Float32Regs, support_v8 = true>;
defm STV_f64 : ST_VEC<Float64Regs>;
}

@@ -2400,6 +2400,12 @@ class VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> :
(ins ADDR:$src),
"ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;

class VLDG_G_ELE_V8<string TyStr, NVPTXRegClass regclass> :
NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4,
regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8),
(ins ADDR:$src),
"ld.global.nc.v8." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>;

// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"b8", Int16Regs>;
def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"b16", Int16Regs>;
@@ -2413,6 +2419,10 @@ def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"b16", Int16Regs>;
def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"b32", Int32Regs>;
def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"b32", Float32Regs>;

def INT_PTX_LDG_G_v4i64_ELE : VLDG_G_ELE_V4<"b64", Int64Regs>;
def INT_PTX_LDG_G_v4f64_ELE : VLDG_G_ELE_V4<"b64", Float64Regs>;
def INT_PTX_LDG_G_v8i32_ELE : VLDG_G_ELE_V8<"b32", Int32Regs>;
def INT_PTX_LDG_G_v8f32_ELE : VLDG_G_ELE_V8<"b32", Float32Regs>;

multiclass NG_TO_G<string Str, bit Supports32 = 1, list<Predicate> Preds = []> {
if Supports32 then

@@ -1808,8 +1808,8 @@ bool NVPTXReplaceImageHandles::replaceImageHandle(MachineOperand &Op,
// For CUDA, we preserve the param loads coming from function arguments
return false;

assert(TexHandleDef.getOperand(7).isSymbol() && "Load is not a symbol!");
StringRef Sym = TexHandleDef.getOperand(7).getSymbolName();
assert(TexHandleDef.getOperand(6).isSymbol() && "Load is not a symbol!");
StringRef Sym = TexHandleDef.getOperand(6).getSymbolName();
InstrsToRemove.insert(&TexHandleDef);
Op.ChangeToES(Sym.data());
MFI->getImageHandleSymbolIndex(Sym);

@@ -20,6 +20,7 @@
#include "NVPTXRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/NVPTXAddrSpace.h"
#include <string>

#define GET_SUBTARGETINFO_HEADER
@@ -72,6 +73,10 @@ public:

const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;

bool has256BitVectorLoadStore(unsigned AS) const {
return SmVersion >= 100 && PTXVersion >= 88 &&
AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
}
bool hasAtomAddF64() const { return SmVersion >= 60; }
bool hasAtomScope() const { return SmVersion >= 60; }
bool hasAtomBitwise64() const { return SmVersion >= 32; }

@@ -591,6 +591,13 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
return nullptr;
}

unsigned NVPTXTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
// 256 bit loads/stores are currently only supported for global address space
if (ST->has256BitVectorLoadStore(AddrSpace))
return 256;
return 128;
}

unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const {
if (isa<AllocaInst>(V))
return ADDRESS_SPACE_LOCAL;
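As a hedged sketch of what the getLoadStoreVecRegBitWidth override above enables (hypothetical IR, not taken from this patch): once the reported width is 256 bits for global pointers, the LoadStoreVectorizer (e.g. opt -passes=load-store-vectorizer) may merge a chain of contiguous global loads that previously capped out at 128 bits.

; Hypothetical input: adjacent i32 loads from addrspace(1); function and
; value names are illustrative only. Previously the vectorizer could
; combine at most four of them (<4 x i32>, 128 bits); with a 256-bit width
; it may now form
;   %v = load <8 x i32>, ptr addrspace(1) %p, align 32
; which the lowering changes above select as ld.global.v8.b32.
define i32 @sum2(ptr addrspace(1) %p) {
  %q = getelementptr inbounds i32, ptr addrspace(1) %p, i64 1
  %a = load i32, ptr addrspace(1) %p, align 32
  %b = load i32, ptr addrspace(1) %q, align 4
  ; ... six more contiguous i32 loads would follow in a full example ...
  %s = add i32 %a, %b
  ret i32 %s
}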
@@ -173,6 +173,8 @@ public:
bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const override;

unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override;

Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const override;
unsigned getAssumedAddrSpace(const Value *V) const override;

@@ -40,9 +40,9 @@ registers:
- { id: 7, class: float32regs }
body: |
bb.0.entry:
%0 = LD_f32 0, 0, 4, 1, 2, 32, &test_param_0, 0
%0 = LD_f32 0, 0, 4, 2, 32, &test_param_0, 0
%1 = CVT_f64_f32 %0, 0
%2 = LD_i32 0, 0, 4, 1, 0, 32, &test_param_1, 0
%2 = LD_i32 0, 0, 4, 0, 32, &test_param_1, 0
; CHECK: %3:float64regs = FADD_rnf64ri %1, double 3.250000e+00
%3 = FADD_rnf64ri %1, double 3.250000e+00
%4 = CVT_f32_f64 %3, 5
@@ -66,9 +66,9 @@ registers:
- { id: 7, class: float32regs }
body: |
bb.0.entry:
%0 = LD_f32 0, 0, 4, 1, 2, 32, &test2_param_0, 0
%0 = LD_f32 0, 0, 4, 2, 32, &test2_param_0, 0
%1 = CVT_f64_f32 %0, 0
%2 = LD_i32 0, 0, 4, 1, 0, 32, &test2_param_1, 0
%2 = LD_i32 0, 0, 4, 0, 32, &test2_param_1, 0
; CHECK: %3:float64regs = FADD_rnf64ri %1, double 0x7FF8000000000000
%3 = FADD_rnf64ri %1, double 0x7FF8000000000000
%4 = CVT_f32_f64 %3, 5

llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll (new file, 520 lines)
@@ -0,0 +1,520 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 -verify-machineinstrs | FileCheck %s -check-prefixes=SM90
; RUN: %if ptxas-12.9 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %}
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 -verify-machineinstrs | FileCheck %s -check-prefixes=SM100
; RUN: %if ptxas-12.9 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}

; For 256-bit vectors, check that invariant loads from the
; global addrspace are lowered to ld.global.nc.

define i8 @ld_global_v32i8(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v32i8(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<16>;
; SM90-NEXT: .reg .b32 %r<19>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v32i8_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: bfe.u32 %r5, %r4, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs1, %r5;
; SM90-NEXT: bfe.u32 %r6, %r3, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs2, %r6;
; SM90-NEXT: bfe.u32 %r7, %r2, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs3, %r7;
; SM90-NEXT: bfe.u32 %r8, %r1, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs4, %r8;
; SM90-NEXT: ld.global.nc.v4.b32 {%r9, %r10, %r11, %r12}, [%rd1];
; SM90-NEXT: bfe.u32 %r13, %r12, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs5, %r13;
; SM90-NEXT: bfe.u32 %r14, %r11, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs6, %r14;
; SM90-NEXT: bfe.u32 %r15, %r10, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs7, %r15;
; SM90-NEXT: bfe.u32 %r16, %r9, 0, 8;
; SM90-NEXT: cvt.u16.u32 %rs8, %r16;
; SM90-NEXT: add.s16 %rs9, %rs8, %rs7;
; SM90-NEXT: add.s16 %rs10, %rs6, %rs5;
; SM90-NEXT: add.s16 %rs11, %rs4, %rs3;
; SM90-NEXT: add.s16 %rs12, %rs2, %rs1;
; SM90-NEXT: add.s16 %rs13, %rs9, %rs10;
; SM90-NEXT: add.s16 %rs14, %rs11, %rs12;
; SM90-NEXT: add.s16 %rs15, %rs13, %rs14;
; SM90-NEXT: cvt.u32.u16 %r17, %rs15;
; SM90-NEXT: and.b32 %r18, %r17, 255;
; SM90-NEXT: st.param.b32 [func_retval0], %r18;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v32i8(
; SM100: {
; SM100-NEXT: .reg .b16 %rs<16>;
; SM100-NEXT: .reg .b32 %r<19>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v32i8_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: bfe.u32 %r9, %r8, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs1, %r9;
; SM100-NEXT: bfe.u32 %r10, %r7, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs2, %r10;
; SM100-NEXT: bfe.u32 %r11, %r6, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs3, %r11;
; SM100-NEXT: bfe.u32 %r12, %r5, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs4, %r12;
; SM100-NEXT: bfe.u32 %r13, %r4, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs5, %r13;
; SM100-NEXT: bfe.u32 %r14, %r3, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs6, %r14;
; SM100-NEXT: bfe.u32 %r15, %r2, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs7, %r15;
; SM100-NEXT: bfe.u32 %r16, %r1, 0, 8;
; SM100-NEXT: cvt.u16.u32 %rs8, %r16;
; SM100-NEXT: add.s16 %rs9, %rs8, %rs7;
; SM100-NEXT: add.s16 %rs10, %rs6, %rs5;
; SM100-NEXT: add.s16 %rs11, %rs4, %rs3;
; SM100-NEXT: add.s16 %rs12, %rs2, %rs1;
; SM100-NEXT: add.s16 %rs13, %rs9, %rs10;
; SM100-NEXT: add.s16 %rs14, %rs11, %rs12;
; SM100-NEXT: add.s16 %rs15, %rs13, %rs14;
; SM100-NEXT: cvt.u32.u16 %r17, %rs15;
; SM100-NEXT: and.b32 %r18, %r17, 255;
; SM100-NEXT: st.param.b32 [func_retval0], %r18;
; SM100-NEXT: ret;
%a = load <32 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <32 x i8> %a, i32 0
%v2 = extractelement <32 x i8> %a, i32 4
%v3 = extractelement <32 x i8> %a, i32 8
%v4 = extractelement <32 x i8> %a, i32 12
%v5 = extractelement <32 x i8> %a, i32 16
%v6 = extractelement <32 x i8> %a, i32 20
%v7 = extractelement <32 x i8> %a, i32 24
%v8 = extractelement <32 x i8> %a, i32 28
%sum1 = add i8 %v1, %v2
%sum2 = add i8 %v3, %v4
%sum3 = add i8 %v5, %v6
%sum4 = add i8 %v7, %v8
%sum5 = add i8 %sum1, %sum2
%sum6 = add i8 %sum3, %sum4
%sum7 = add i8 %sum5, %sum6
ret i8 %sum7
}

define i16 @ld_global_v16i16(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v16i16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<16>;
; SM90-NEXT: .reg .b32 %r<10>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v16i16_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: mov.b32 {%rs1, _}, %r4;
; SM90-NEXT: mov.b32 {%rs2, _}, %r3;
; SM90-NEXT: mov.b32 {%rs3, _}, %r2;
; SM90-NEXT: mov.b32 {%rs4, _}, %r1;
; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
; SM90-NEXT: mov.b32 {%rs5, _}, %r8;
; SM90-NEXT: mov.b32 {%rs6, _}, %r7;
; SM90-NEXT: mov.b32 {%rs7, _}, %r6;
; SM90-NEXT: mov.b32 {%rs8, _}, %r5;
; SM90-NEXT: add.s16 %rs9, %rs8, %rs7;
; SM90-NEXT: add.s16 %rs10, %rs6, %rs5;
; SM90-NEXT: add.s16 %rs11, %rs4, %rs3;
; SM90-NEXT: add.s16 %rs12, %rs2, %rs1;
; SM90-NEXT: add.s16 %rs13, %rs9, %rs10;
; SM90-NEXT: add.s16 %rs14, %rs11, %rs12;
; SM90-NEXT: add.s16 %rs15, %rs13, %rs14;
; SM90-NEXT: cvt.u32.u16 %r9, %rs15;
; SM90-NEXT: st.param.b32 [func_retval0], %r9;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v16i16(
; SM100: {
; SM100-NEXT: .reg .b16 %rs<16>;
; SM100-NEXT: .reg .b32 %r<10>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v16i16_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: mov.b32 {%rs1, _}, %r8;
; SM100-NEXT: mov.b32 {%rs2, _}, %r7;
; SM100-NEXT: mov.b32 {%rs3, _}, %r6;
; SM100-NEXT: mov.b32 {%rs4, _}, %r5;
; SM100-NEXT: mov.b32 {%rs5, _}, %r4;
; SM100-NEXT: mov.b32 {%rs6, _}, %r3;
; SM100-NEXT: mov.b32 {%rs7, _}, %r2;
; SM100-NEXT: mov.b32 {%rs8, _}, %r1;
; SM100-NEXT: add.s16 %rs9, %rs8, %rs7;
; SM100-NEXT: add.s16 %rs10, %rs6, %rs5;
; SM100-NEXT: add.s16 %rs11, %rs4, %rs3;
; SM100-NEXT: add.s16 %rs12, %rs2, %rs1;
; SM100-NEXT: add.s16 %rs13, %rs9, %rs10;
; SM100-NEXT: add.s16 %rs14, %rs11, %rs12;
; SM100-NEXT: add.s16 %rs15, %rs13, %rs14;
; SM100-NEXT: cvt.u32.u16 %r9, %rs15;
; SM100-NEXT: st.param.b32 [func_retval0], %r9;
; SM100-NEXT: ret;
%a = load <16 x i16>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <16 x i16> %a, i32 0
%v2 = extractelement <16 x i16> %a, i32 2
%v3 = extractelement <16 x i16> %a, i32 4
%v4 = extractelement <16 x i16> %a, i32 6
%v5 = extractelement <16 x i16> %a, i32 8
%v6 = extractelement <16 x i16> %a, i32 10
%v7 = extractelement <16 x i16> %a, i32 12
%v8 = extractelement <16 x i16> %a, i32 14
%sum1 = add i16 %v1, %v2
%sum2 = add i16 %v3, %v4
%sum3 = add i16 %v5, %v6
%sum4 = add i16 %v7, %v8
%sum5 = add i16 %sum1, %sum2
%sum6 = add i16 %sum3, %sum4
%sum7 = add i16 %sum5, %sum6
ret i16 %sum7
}

define half @ld_global_v16f16(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v16f16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<16>;
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v16f16_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: mov.b32 {%rs1, _}, %r4;
; SM90-NEXT: mov.b32 {%rs2, _}, %r3;
; SM90-NEXT: mov.b32 {%rs3, _}, %r2;
; SM90-NEXT: mov.b32 {%rs4, _}, %r1;
; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
; SM90-NEXT: mov.b32 {%rs5, _}, %r8;
; SM90-NEXT: mov.b32 {%rs6, _}, %r7;
; SM90-NEXT: mov.b32 {%rs7, _}, %r6;
; SM90-NEXT: mov.b32 {%rs8, _}, %r5;
; SM90-NEXT: add.rn.f16 %rs9, %rs8, %rs7;
; SM90-NEXT: add.rn.f16 %rs10, %rs6, %rs5;
; SM90-NEXT: add.rn.f16 %rs11, %rs4, %rs3;
; SM90-NEXT: add.rn.f16 %rs12, %rs2, %rs1;
; SM90-NEXT: add.rn.f16 %rs13, %rs9, %rs10;
; SM90-NEXT: add.rn.f16 %rs14, %rs11, %rs12;
; SM90-NEXT: add.rn.f16 %rs15, %rs13, %rs14;
; SM90-NEXT: st.param.b16 [func_retval0], %rs15;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v16f16(
; SM100: {
; SM100-NEXT: .reg .b16 %rs<16>;
; SM100-NEXT: .reg .b32 %r<9>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v16f16_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: mov.b32 {%rs1, _}, %r8;
; SM100-NEXT: mov.b32 {%rs2, _}, %r7;
; SM100-NEXT: mov.b32 {%rs3, _}, %r6;
; SM100-NEXT: mov.b32 {%rs4, _}, %r5;
; SM100-NEXT: mov.b32 {%rs5, _}, %r4;
; SM100-NEXT: mov.b32 {%rs6, _}, %r3;
; SM100-NEXT: mov.b32 {%rs7, _}, %r2;
; SM100-NEXT: mov.b32 {%rs8, _}, %r1;
; SM100-NEXT: add.rn.f16 %rs9, %rs8, %rs7;
; SM100-NEXT: add.rn.f16 %rs10, %rs6, %rs5;
; SM100-NEXT: add.rn.f16 %rs11, %rs4, %rs3;
; SM100-NEXT: add.rn.f16 %rs12, %rs2, %rs1;
; SM100-NEXT: add.rn.f16 %rs13, %rs9, %rs10;
; SM100-NEXT: add.rn.f16 %rs14, %rs11, %rs12;
; SM100-NEXT: add.rn.f16 %rs15, %rs13, %rs14;
; SM100-NEXT: st.param.b16 [func_retval0], %rs15;
; SM100-NEXT: ret;
%a = load <16 x half>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <16 x half> %a, i32 0
%v2 = extractelement <16 x half> %a, i32 2
%v3 = extractelement <16 x half> %a, i32 4
%v4 = extractelement <16 x half> %a, i32 6
%v5 = extractelement <16 x half> %a, i32 8
%v6 = extractelement <16 x half> %a, i32 10
%v7 = extractelement <16 x half> %a, i32 12
%v8 = extractelement <16 x half> %a, i32 14
%sum1 = fadd half %v1, %v2
%sum2 = fadd half %v3, %v4
%sum3 = fadd half %v5, %v6
%sum4 = fadd half %v7, %v8
%sum5 = fadd half %sum1, %sum2
%sum6 = fadd half %sum3, %sum4
%sum7 = fadd half %sum5, %sum6
ret half %sum7
}

define bfloat @ld_global_v16bf16(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v16bf16(
; SM90: {
; SM90-NEXT: .reg .b16 %rs<16>;
; SM90-NEXT: .reg .b32 %r<9>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v16bf16_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: mov.b32 {%rs1, _}, %r4;
; SM90-NEXT: mov.b32 {%rs2, _}, %r3;
; SM90-NEXT: mov.b32 {%rs3, _}, %r2;
; SM90-NEXT: mov.b32 {%rs4, _}, %r1;
; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
; SM90-NEXT: mov.b32 {%rs5, _}, %r8;
; SM90-NEXT: mov.b32 {%rs6, _}, %r7;
; SM90-NEXT: mov.b32 {%rs7, _}, %r6;
; SM90-NEXT: mov.b32 {%rs8, _}, %r5;
; SM90-NEXT: add.rn.bf16 %rs9, %rs8, %rs7;
; SM90-NEXT: add.rn.bf16 %rs10, %rs6, %rs5;
; SM90-NEXT: add.rn.bf16 %rs11, %rs4, %rs3;
; SM90-NEXT: add.rn.bf16 %rs12, %rs2, %rs1;
; SM90-NEXT: add.rn.bf16 %rs13, %rs9, %rs10;
; SM90-NEXT: add.rn.bf16 %rs14, %rs11, %rs12;
; SM90-NEXT: add.rn.bf16 %rs15, %rs13, %rs14;
; SM90-NEXT: st.param.b16 [func_retval0], %rs15;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v16bf16(
; SM100: {
; SM100-NEXT: .reg .b16 %rs<16>;
; SM100-NEXT: .reg .b32 %r<9>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v16bf16_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: mov.b32 {%rs1, _}, %r8;
; SM100-NEXT: mov.b32 {%rs2, _}, %r7;
; SM100-NEXT: mov.b32 {%rs3, _}, %r6;
; SM100-NEXT: mov.b32 {%rs4, _}, %r5;
; SM100-NEXT: mov.b32 {%rs5, _}, %r4;
; SM100-NEXT: mov.b32 {%rs6, _}, %r3;
; SM100-NEXT: mov.b32 {%rs7, _}, %r2;
; SM100-NEXT: mov.b32 {%rs8, _}, %r1;
; SM100-NEXT: add.rn.bf16 %rs9, %rs8, %rs7;
; SM100-NEXT: add.rn.bf16 %rs10, %rs6, %rs5;
; SM100-NEXT: add.rn.bf16 %rs11, %rs4, %rs3;
; SM100-NEXT: add.rn.bf16 %rs12, %rs2, %rs1;
; SM100-NEXT: add.rn.bf16 %rs13, %rs9, %rs10;
; SM100-NEXT: add.rn.bf16 %rs14, %rs11, %rs12;
; SM100-NEXT: add.rn.bf16 %rs15, %rs13, %rs14;
; SM100-NEXT: st.param.b16 [func_retval0], %rs15;
; SM100-NEXT: ret;
%a = load <16 x bfloat>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <16 x bfloat> %a, i32 0
%v2 = extractelement <16 x bfloat> %a, i32 2
%v3 = extractelement <16 x bfloat> %a, i32 4
%v4 = extractelement <16 x bfloat> %a, i32 6
%v5 = extractelement <16 x bfloat> %a, i32 8
%v6 = extractelement <16 x bfloat> %a, i32 10
%v7 = extractelement <16 x bfloat> %a, i32 12
%v8 = extractelement <16 x bfloat> %a, i32 14
%sum1 = fadd bfloat %v1, %v2
%sum2 = fadd bfloat %v3, %v4
%sum3 = fadd bfloat %v5, %v6
%sum4 = fadd bfloat %v7, %v8
%sum5 = fadd bfloat %sum1, %sum2
%sum6 = fadd bfloat %sum3, %sum4
%sum7 = fadd bfloat %sum5, %sum6
ret bfloat %sum7
}

define i32 @ld_global_v8i32(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v8i32(
; SM90: {
; SM90-NEXT: .reg .b32 %r<16>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v8i32_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16];
; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1];
; SM90-NEXT: add.s32 %r9, %r5, %r6;
; SM90-NEXT: add.s32 %r10, %r7, %r8;
; SM90-NEXT: add.s32 %r11, %r1, %r2;
; SM90-NEXT: add.s32 %r12, %r3, %r4;
; SM90-NEXT: add.s32 %r13, %r9, %r10;
; SM90-NEXT: add.s32 %r14, %r11, %r12;
; SM90-NEXT: add.s32 %r15, %r13, %r14;
; SM90-NEXT: st.param.b32 [func_retval0], %r15;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v8i32(
; SM100: {
; SM100-NEXT: .reg .b32 %r<16>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v8i32_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; SM100-NEXT: add.s32 %r9, %r1, %r2;
; SM100-NEXT: add.s32 %r10, %r3, %r4;
; SM100-NEXT: add.s32 %r11, %r5, %r6;
; SM100-NEXT: add.s32 %r12, %r7, %r8;
; SM100-NEXT: add.s32 %r13, %r9, %r10;
; SM100-NEXT: add.s32 %r14, %r11, %r12;
; SM100-NEXT: add.s32 %r15, %r13, %r14;
; SM100-NEXT: st.param.b32 [func_retval0], %r15;
; SM100-NEXT: ret;
%a = load <8 x i32>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <8 x i32> %a, i32 0
%v2 = extractelement <8 x i32> %a, i32 1
%v3 = extractelement <8 x i32> %a, i32 2
%v4 = extractelement <8 x i32> %a, i32 3
%v5 = extractelement <8 x i32> %a, i32 4
%v6 = extractelement <8 x i32> %a, i32 5
%v7 = extractelement <8 x i32> %a, i32 6
%v8 = extractelement <8 x i32> %a, i32 7
%sum1 = add i32 %v1, %v2
%sum2 = add i32 %v3, %v4
%sum3 = add i32 %v5, %v6
%sum4 = add i32 %v7, %v8
%sum5 = add i32 %sum1, %sum2
%sum6 = add i32 %sum3, %sum4
%sum7 = add i32 %sum5, %sum6

ret i32 %sum7
}

define float @ld_global_v8f32(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v8f32(
; SM90: {
; SM90-NEXT: .reg .b32 %f<16>;
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v8f32_param_0];
; SM90-NEXT: ld.global.nc.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1+16];
; SM90-NEXT: ld.global.nc.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1];
; SM90-NEXT: add.rn.f32 %f9, %f5, %f6;
; SM90-NEXT: add.rn.f32 %f10, %f7, %f8;
; SM90-NEXT: add.rn.f32 %f11, %f1, %f2;
; SM90-NEXT: add.rn.f32 %f12, %f3, %f4;
; SM90-NEXT: add.rn.f32 %f13, %f9, %f10;
; SM90-NEXT: add.rn.f32 %f14, %f11, %f12;
; SM90-NEXT: add.rn.f32 %f15, %f13, %f14;
; SM90-NEXT: st.param.b32 [func_retval0], %f15;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v8f32(
; SM100: {
; SM100-NEXT: .reg .b32 %f<16>;
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v8f32_param_0];
; SM100-NEXT: ld.global.nc.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1];
; SM100-NEXT: add.rn.f32 %f9, %f1, %f2;
; SM100-NEXT: add.rn.f32 %f10, %f3, %f4;
; SM100-NEXT: add.rn.f32 %f11, %f5, %f6;
; SM100-NEXT: add.rn.f32 %f12, %f7, %f8;
; SM100-NEXT: add.rn.f32 %f13, %f9, %f10;
; SM100-NEXT: add.rn.f32 %f14, %f11, %f12;
; SM100-NEXT: add.rn.f32 %f15, %f13, %f14;
; SM100-NEXT: st.param.b32 [func_retval0], %f15;
; SM100-NEXT: ret;
%a = load <8 x float>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <8 x float> %a, i32 0
%v2 = extractelement <8 x float> %a, i32 1
%v3 = extractelement <8 x float> %a, i32 2
%v4 = extractelement <8 x float> %a, i32 3
%v5 = extractelement <8 x float> %a, i32 4
%v6 = extractelement <8 x float> %a, i32 5
%v7 = extractelement <8 x float> %a, i32 6
%v8 = extractelement <8 x float> %a, i32 7
%sum1 = fadd float %v1, %v2
%sum2 = fadd float %v3, %v4
%sum3 = fadd float %v5, %v6
%sum4 = fadd float %v7, %v8
%sum5 = fadd float %sum1, %sum2
%sum6 = fadd float %sum3, %sum4
%sum7 = fadd float %sum5, %sum6

ret float %sum7
}

define i64 @ld_global_v4i64(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v4i64(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<9>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v4i64_param_0];
; SM90-NEXT: ld.global.nc.v2.b64 {%rd2, %rd3}, [%rd1+16];
; SM90-NEXT: ld.global.nc.v2.b64 {%rd4, %rd5}, [%rd1];
; SM90-NEXT: add.s64 %rd6, %rd4, %rd5;
; SM90-NEXT: add.s64 %rd7, %rd2, %rd3;
; SM90-NEXT: add.s64 %rd8, %rd6, %rd7;
; SM90-NEXT: st.param.b64 [func_retval0], %rd8;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v4i64(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<9>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v4i64_param_0];
; SM100-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; SM100-NEXT: add.s64 %rd6, %rd2, %rd3;
; SM100-NEXT: add.s64 %rd7, %rd4, %rd5;
; SM100-NEXT: add.s64 %rd8, %rd6, %rd7;
; SM100-NEXT: st.param.b64 [func_retval0], %rd8;
; SM100-NEXT: ret;
%a = load <4 x i64>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <4 x i64> %a, i32 0
%v2 = extractelement <4 x i64> %a, i32 1
%v3 = extractelement <4 x i64> %a, i32 2
%v4 = extractelement <4 x i64> %a, i32 3
%sum1 = add i64 %v1, %v2
%sum2 = add i64 %v3, %v4
%sum3 = add i64 %sum1, %sum2
ret i64 %sum3
}

define double @ld_global_v4f64(ptr addrspace(1) %ptr) {
; SM90-LABEL: ld_global_v4f64(
; SM90: {
; SM90-NEXT: .reg .b64 %rd<2>;
; SM90-NEXT: .reg .b64 %fd<8>;
; SM90-EMPTY:
; SM90-NEXT: // %bb.0:
; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v4f64_param_0];
; SM90-NEXT: ld.global.nc.v2.b64 {%fd1, %fd2}, [%rd1+16];
; SM90-NEXT: ld.global.nc.v2.b64 {%fd3, %fd4}, [%rd1];
; SM90-NEXT: add.rn.f64 %fd5, %fd3, %fd4;
; SM90-NEXT: add.rn.f64 %fd6, %fd1, %fd2;
; SM90-NEXT: add.rn.f64 %fd7, %fd5, %fd6;
; SM90-NEXT: st.param.b64 [func_retval0], %fd7;
; SM90-NEXT: ret;
;
; SM100-LABEL: ld_global_v4f64(
; SM100: {
; SM100-NEXT: .reg .b64 %rd<2>;
; SM100-NEXT: .reg .b64 %fd<8>;
; SM100-EMPTY:
; SM100-NEXT: // %bb.0:
; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v4f64_param_0];
; SM100-NEXT: ld.global.nc.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1];
; SM100-NEXT: add.rn.f64 %fd5, %fd1, %fd2;
; SM100-NEXT: add.rn.f64 %fd6, %fd3, %fd4;
; SM100-NEXT: add.rn.f64 %fd7, %fd5, %fd6;
; SM100-NEXT: st.param.b64 [func_retval0], %fd7;
; SM100-NEXT: ret;
%a = load <4 x double>, ptr addrspace(1) %ptr, !invariant.load !0
%v1 = extractelement <4 x double> %a, i32 0
%v2 = extractelement <4 x double> %a, i32 1
%v3 = extractelement <4 x double> %a, i32 2
%v4 = extractelement <4 x double> %a, i32 3
%sum1 = fadd double %v1, %v2
%sum2 = fadd double %v3, %v4
%sum3 = fadd double %sum1, %sum2
ret double %sum3
}

!0 = !{}

llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll (new file, 549 lines)
@@ -0,0 +1,549 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=PTX
; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}

; In this test, we check that all the addressing modes are lowered correctly
; for 256-bit invariant loads, which get lowered to ld.global.nc
; addr can be any of the following:
; - avar : direct address
; - asi: direct address + offset
; - areg_64: 64-bit register
; - ari_64: 64-bit register + offset
; Since this is a blackwell+ feature,
; and support for 32-bit addressing does not exist after sm_90,
; the "areg" and "ari" 32-bit addressing modes are not tested or supported.

; For invariant loads, asi is historically not supported,
; and instead it is selected as move into register, add of offset, and loaded as areg64

; Checks 8 types: i8, i16, bfloat, half, i32, i64, float, double

; Global is the only address space that currently supports 256-bit loads/stores

@globalin = external addrspace(1) global ptr
@globalout = external addrspace(1) global ptr

define void @avar_i8() {
; PTX-LABEL: avar_i8(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <32 x i8>, ptr addrspace(1) @globalin, !invariant.load !0
store <32 x i8> %load, ptr addrspace(1) @globalout
ret void
}

define void @avar_i16() {
; PTX-LABEL: avar_i16(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <16 x i16>, ptr addrspace(1) @globalin, !invariant.load !0
store <16 x i16> %load, ptr addrspace(1) @globalout
ret void
}

define void @avar_half() {
; PTX-LABEL: avar_half(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <16 x half>, ptr addrspace(1) @globalin, !invariant.load !0
store <16 x half> %load, ptr addrspace(1) @globalout
ret void
}

define void @avar_bfloat() {
; PTX-LABEL: avar_bfloat(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <16 x bfloat>, ptr addrspace(1) @globalin, !invariant.load !0
store <16 x bfloat> %load, ptr addrspace(1) @globalout
ret void
}

define void @avar_i32() {
; PTX-LABEL: avar_i32(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <8 x i32>, ptr addrspace(1) @globalin, !invariant.load !0
store <8 x i32> %load, ptr addrspace(1) @globalout
ret void
}

define void @avar_i64() {
; PTX-LABEL: avar_i64(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin];
; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
%load = load <4 x i64>, ptr addrspace(1) @globalin, !invariant.load !0
store <4 x i64> %load, ptr addrspace(1) @globalout
ret void
}

define void @avar_float() {
; PTX-LABEL: avar_float(
; PTX: {
; PTX-NEXT: .reg .b32 %f<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
; PTX-NEXT: ret;
%load = load <8 x float>, ptr addrspace(1) @globalin, !invariant.load !0
store <8 x float> %load, ptr addrspace(1) @globalout
ret void
}

define void @avar_double() {
; PTX-LABEL: avar_double(
; PTX: {
; PTX-NEXT: .reg .b64 %fd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [globalin];
; PTX-NEXT: st.global.v4.b64 [globalout], {%fd1, %fd2, %fd3, %fd4};
; PTX-NEXT: ret;
%load = load <4 x double>, ptr addrspace(1) @globalin, !invariant.load !0
store <4 x double> %load, ptr addrspace(1) @globalout
ret void
}

define void @asi_i8() {
; PTX-LABEL: asi_i8(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <32 x i8>, ptr addrspace(1) %in.offset, !invariant.load !0
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <32 x i8> %load, ptr addrspace(1) %out.offset
ret void
}

define void @asi_i16() {
; PTX-LABEL: asi_i16(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <16 x i16>, ptr addrspace(1) %in.offset, !invariant.load !0
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <16 x i16> %load, ptr addrspace(1) %out.offset
ret void
}

define void @asi_half() {
; PTX-LABEL: asi_half(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <16 x half>, ptr addrspace(1) %in.offset, !invariant.load !0
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <16 x half> %load, ptr addrspace(1) %out.offset
ret void
}

define void @asi_bfloat() {
; PTX-LABEL: asi_bfloat(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <16 x bfloat>, ptr addrspace(1) %in.offset, !invariant.load !0
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <16 x bfloat> %load, ptr addrspace(1) %out.offset
ret void
}

define void @asi_i32() {
; PTX-LABEL: asi_i32(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <8 x i32> %load, ptr addrspace(1) %out.offset
ret void
}

define void @asi_i64() {
; PTX-LABEL: asi_i64(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b64 %rd<5>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
|
||||
; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
|
||||
; PTX-NEXT: ret;
|
||||
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
|
||||
%load = load <4 x i64>, ptr addrspace(1) %in.offset, !invariant.load !0
|
||||
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
|
||||
store <4 x i64> %load, ptr addrspace(1) %out.offset
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @asi_float() {
|
||||
; PTX-LABEL: asi_float(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b32 %f<9>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.global.nc.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin+32];
|
||||
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
|
||||
; PTX-NEXT: ret;
|
||||
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
|
||||
%load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0
|
||||
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
|
||||
store <8 x float> %load, ptr addrspace(1) %out.offset
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @asi_double() {
|
||||
; PTX-LABEL: asi_double(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b64 %fd<5>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.global.nc.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [globalin+32];
|
||||
; PTX-NEXT: st.global.v4.b64 [globalout+32], {%fd1, %fd2, %fd3, %fd4};
|
||||
; PTX-NEXT: ret;
|
||||
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
|
||||
%load = load <4 x double>, ptr addrspace(1) %in.offset, !invariant.load !0
|
||||
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
|
||||
store <4 x double> %load, ptr addrspace(1) %out.offset
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @areg_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: areg_64_i8(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b32 %r<9>;
|
||||
; PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i8_param_0];
|
||||
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
|
||||
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i8_param_1];
|
||||
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
|
||||
; PTX-NEXT: ret;
|
||||
%load = load <32 x i8>, ptr addrspace(1) %in, !invariant.load !0
|
||||
store <32 x i8> %load, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
define void @areg_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: areg_64_i16(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b32 %r<9>;
|
||||
; PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i16_param_0];
|
||||
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
|
||||
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i16_param_1];
|
||||
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
|
||||
; PTX-NEXT: ret;
|
||||
%load = load <16 x i16>, ptr addrspace(1) %in, !invariant.load !0
|
||||
store <16 x i16> %load, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
define void @areg_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: areg_64_half(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b32 %r<9>;
|
||||
; PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_half_param_0];
|
||||
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
|
||||
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_half_param_1];
|
||||
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
|
||||
; PTX-NEXT: ret;
|
||||
%load = load <16 x half>, ptr addrspace(1) %in, !invariant.load !0
|
||||
store <16 x half> %load, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: areg_64_bfloat(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b32 %r<9>;
|
||||
; PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_bfloat_param_0];
|
||||
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
|
||||
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_bfloat_param_1];
|
||||
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
|
||||
; PTX-NEXT: ret;
|
||||
%load = load <16 x bfloat>, ptr addrspace(1) %in, !invariant.load !0
|
||||
store <16 x bfloat> %load, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: areg_64_i32(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b32 %r<9>;
|
||||
; PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i32_param_0];
|
||||
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
|
||||
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i32_param_1];
|
||||
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
|
||||
; PTX-NEXT: ret;
|
||||
%load = load <8 x i32>, ptr addrspace(1) %in, !invariant.load !0
|
||||
store <8 x i32> %load, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: areg_64_i64(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b64 %rd<7>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i64_param_0];
|
||||
; PTX-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
|
||||
; PTX-NEXT: ld.param.b64 %rd6, [areg_64_i64_param_1];
|
||||
; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
|
||||
; PTX-NEXT: ret;
|
||||
%load = load <4 x i64>, ptr addrspace(1) %in, !invariant.load !0
|
||||
store <4 x i64> %load, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: areg_64_float(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b32 %f<9>;
|
||||
; PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_float_param_0];
|
||||
; PTX-NEXT: ld.global.nc.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1];
|
||||
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_float_param_1];
|
||||
; PTX-NEXT: st.global.v8.b32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
|
||||
; PTX-NEXT: ret;
|
||||
%load = load <8 x float>, ptr addrspace(1) %in, !invariant.load !0
|
||||
store <8 x float> %load, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @areg_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: areg_64_double(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX-NEXT: .reg .b64 %fd<5>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_double_param_0];
|
||||
; PTX-NEXT: ld.global.nc.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1];
|
||||
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_double_param_1];
|
||||
; PTX-NEXT: st.global.v4.b64 [%rd2], {%fd1, %fd2, %fd3, %fd4};
|
||||
; PTX-NEXT: ret;
|
||||
%load = load <4 x double>, ptr addrspace(1) %in, !invariant.load !0
|
||||
store <4 x double> %load, ptr addrspace(1) %out
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @ari_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: ari_64_i8(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b32 %r<9>;
|
||||
; PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i8_param_0];
|
||||
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i8_param_1];
|
||||
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
|
||||
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
|
||||
; PTX-NEXT: ret;
|
||||
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
|
||||
%load = load <32 x i8>, ptr addrspace(1) %in.offset, !invariant.load !0
|
||||
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
|
||||
store <32 x i8> %load, ptr addrspace(1) %out.offset
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @ari_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: ari_64_i16(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b32 %r<9>;
|
||||
; PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i16_param_0];
|
||||
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i16_param_1];
|
||||
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
|
||||
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
|
||||
; PTX-NEXT: ret;
|
||||
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
|
||||
%load = load <16 x i16>, ptr addrspace(1) %in.offset, !invariant.load !0
|
||||
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
|
||||
store <16 x i16> %load, ptr addrspace(1) %out.offset
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @ari_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: ari_64_half(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b32 %r<9>;
|
||||
; PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_half_param_0];
|
||||
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_half_param_1];
|
||||
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
|
||||
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
|
||||
; PTX-NEXT: ret;
|
||||
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
|
||||
%load = load <16 x half>, ptr addrspace(1) %in.offset, !invariant.load !0
|
||||
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
|
||||
store <16 x half> %load, ptr addrspace(1) %out.offset
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: ari_64_bfloat(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b32 %r<9>;
|
||||
; PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_bfloat_param_0];
|
||||
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_bfloat_param_1];
|
||||
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
|
||||
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
|
||||
; PTX-NEXT: ret;
|
||||
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
|
||||
%load = load <16 x bfloat>, ptr addrspace(1) %in.offset, !invariant.load !0
|
||||
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
|
||||
store <16 x bfloat> %load, ptr addrspace(1) %out.offset
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: ari_64_i32(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b32 %r<9>;
|
||||
; PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i32_param_0];
|
||||
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i32_param_1];
|
||||
; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
|
||||
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
|
||||
; PTX-NEXT: ret;
|
||||
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
|
||||
%load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0
|
||||
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
|
||||
store <8 x i32> %load, ptr addrspace(1) %out.offset
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: ari_64_i64(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b64 %rd<7>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i64_param_0];
|
||||
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i64_param_1];
|
||||
; PTX-NEXT: ld.global.nc.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
|
||||
; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
|
||||
; PTX-NEXT: ret;
|
||||
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
|
||||
%load = load <4 x i64>, ptr addrspace(1) %in.offset, !invariant.load !0
|
||||
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
|
||||
store <4 x i64> %load, ptr addrspace(1) %out.offset
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: ari_64_float(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b32 %f<9>;
|
||||
; PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_float_param_0];
|
||||
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_float_param_1];
|
||||
; PTX-NEXT: ld.global.nc.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1+32];
|
||||
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
|
||||
; PTX-NEXT: ret;
|
||||
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
|
||||
%load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0
|
||||
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
|
||||
store <8 x float> %load, ptr addrspace(1) %out.offset
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @ari_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
|
||||
; PTX-LABEL: ari_64_double(
|
||||
; PTX: {
|
||||
; PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; PTX-NEXT: .reg .b64 %fd<5>;
|
||||
; PTX-EMPTY:
|
||||
; PTX-NEXT: // %bb.0:
|
||||
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_double_param_0];
|
||||
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_double_param_1];
|
||||
; PTX-NEXT: ld.global.nc.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1+32];
|
||||
; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%fd1, %fd2, %fd3, %fd4};
|
||||
; PTX-NEXT: ret;
|
||||
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
|
||||
%load = load <4 x double>, ptr addrspace(1) %in.offset, !invariant.load !0
|
||||
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
|
||||
store <4 x double> %load, ptr addrspace(1) %out.offset
|
||||
ret void
|
||||
}
|
||||
|
||||
!0 = !{}
|
||||
llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll (new file)
@@ -0,0 +1,543 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=PTX
; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %}

; In this test, we check that all the addressing modes are lowered correctly;
; addr can be any of the following (each form is sketched below):
; - avar: direct address
; - asi: direct address + offset
; - areg_64: 64-bit register
; - ari_64: 64-bit register + offset
; Since this is a Blackwell+ feature, and support for 32-bit addressing does
; not exist after sm_90, the 32-bit "areg" and "ari" addressing modes are not
; tested or supported.

; Checks 8 types: i8, i16, bfloat, half, i32, i64, float, double

; Global is the only address space that currently supports 256-bit loads/stores.
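; Hand-written illustration (not an autogenerated check; the function name is
; made up for this sketch). It combines two of the modes above: the source is
; a direct address + offset (asi) and the destination is a 64-bit register +
; offset (ari_64), matching the [globalin+32] and [%rd1+32] operand forms the
; CHECK lines below verify.
define void @addressing_sketch(ptr addrspace(1) %p) {
  %src = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 ; asi form
  %dst = getelementptr inbounds i8, ptr addrspace(1) %p, i32 32       ; ari_64 form
  %v = load <8 x i32>, ptr addrspace(1) %src    ; expect ld.global.v8.b32 ..., [globalin+32]
  store <8 x i32> %v, ptr addrspace(1) %dst     ; expect st.global.v8.b32 [reg+32], ...
  ret void
}
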
@globalin = external addrspace(1) global ptr
@globalout = external addrspace(1) global ptr

define void @avar_i8() {
; PTX-LABEL: avar_i8(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <32 x i8>, ptr addrspace(1) @globalin
store <32 x i8> %load, ptr addrspace(1) @globalout
ret void
}

define void @avar_i16() {
; PTX-LABEL: avar_i16(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <16 x i16>, ptr addrspace(1) @globalin
store <16 x i16> %load, ptr addrspace(1) @globalout
ret void
}

define void @avar_half() {
; PTX-LABEL: avar_half(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <16 x half>, ptr addrspace(1) @globalin
store <16 x half> %load, ptr addrspace(1) @globalout
ret void
}

define void @avar_bfloat() {
; PTX-LABEL: avar_bfloat(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <16 x bfloat>, ptr addrspace(1) @globalin
store <16 x bfloat> %load, ptr addrspace(1) @globalout
ret void
}

define void @avar_i32() {
; PTX-LABEL: avar_i32(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <8 x i32>, ptr addrspace(1) @globalin
store <8 x i32> %load, ptr addrspace(1) @globalout
ret void
}

define void @avar_i64() {
; PTX-LABEL: avar_i64(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin];
; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
%load = load <4 x i64>, ptr addrspace(1) @globalin
store <4 x i64> %load, ptr addrspace(1) @globalout
ret void
}

define void @avar_float() {
; PTX-LABEL: avar_float(
; PTX: {
; PTX-NEXT: .reg .b32 %f<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin];
; PTX-NEXT: st.global.v8.b32 [globalout], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
; PTX-NEXT: ret;
%load = load <8 x float>, ptr addrspace(1) @globalin
store <8 x float> %load, ptr addrspace(1) @globalout
ret void
}

define void @avar_double() {
; PTX-LABEL: avar_double(
; PTX: {
; PTX-NEXT: .reg .b64 %fd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [globalin];
; PTX-NEXT: st.global.v4.b64 [globalout], {%fd1, %fd2, %fd3, %fd4};
; PTX-NEXT: ret;
%load = load <4 x double>, ptr addrspace(1) @globalin
store <4 x double> %load, ptr addrspace(1) @globalout
ret void
}

define void @asi_i8() {
; PTX-LABEL: asi_i8(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <32 x i8>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <32 x i8> %load, ptr addrspace(1) %out.offset
ret void
}

define void @asi_i16() {
; PTX-LABEL: asi_i16(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <16 x i16>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <16 x i16> %load, ptr addrspace(1) %out.offset
ret void
}

define void @asi_half() {
; PTX-LABEL: asi_half(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <16 x half>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <16 x half> %load, ptr addrspace(1) %out.offset
ret void
}

define void @asi_bfloat() {
; PTX-LABEL: asi_bfloat(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <16 x bfloat>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <16 x bfloat> %load, ptr addrspace(1) %out.offset
ret void
}

define void @asi_i32() {
; PTX-LABEL: asi_i32(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <8 x i32>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <8 x i32> %load, ptr addrspace(1) %out.offset
ret void
}

define void @asi_i64() {
; PTX-LABEL: asi_i64(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <4 x i64>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <4 x i64> %load, ptr addrspace(1) %out.offset
ret void
}

define void @asi_float() {
; PTX-LABEL: asi_float(
; PTX: {
; PTX-NEXT: .reg .b32 %f<9>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin+32];
; PTX-NEXT: st.global.v8.b32 [globalout+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <8 x float>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <8 x float> %load, ptr addrspace(1) %out.offset
ret void
}

define void @asi_double() {
; PTX-LABEL: asi_double(
; PTX: {
; PTX-NEXT: .reg .b64 %fd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [globalin+32];
; PTX-NEXT: st.global.v4.b64 [globalout+32], {%fd1, %fd2, %fd3, %fd4};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
%load = load <4 x double>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32
store <4 x double> %load, ptr addrspace(1) %out.offset
ret void
}

define void @areg_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i8(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i8_param_0];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i8_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <32 x i8>, ptr addrspace(1) %in
store <32 x i8> %load, ptr addrspace(1) %out
ret void
}
define void @areg_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i16(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i16_param_0];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i16_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <16 x i16>, ptr addrspace(1) %in
store <16 x i16> %load, ptr addrspace(1) %out
ret void
}
define void @areg_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_half(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_half_param_0];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_half_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <16 x half>, ptr addrspace(1) %in
store <16 x half> %load, ptr addrspace(1) %out
ret void
}
define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_bfloat(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_bfloat_param_0];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_bfloat_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <16 x bfloat>, ptr addrspace(1) %in
store <16 x bfloat> %load, ptr addrspace(1) %out
ret void
}

define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i32(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i32_param_0];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i32_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%load = load <8 x i32>, ptr addrspace(1) %in
store <8 x i32> %load, ptr addrspace(1) %out
ret void
}

define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_i64(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i64_param_0];
; PTX-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd6, [areg_64_i64_param_1];
; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
; PTX-NEXT: ret;
%load = load <4 x i64>, ptr addrspace(1) %in
store <4 x i64> %load, ptr addrspace(1) %out
ret void
}

define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_float(
; PTX: {
; PTX-NEXT: .reg .b32 %f<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_float_param_0];
; PTX-NEXT: ld.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_float_param_1];
; PTX-NEXT: st.global.v8.b32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
; PTX-NEXT: ret;
%load = load <8 x float>, ptr addrspace(1) %in
store <8 x float> %load, ptr addrspace(1) %out
ret void
}

define void @areg_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: areg_64_double(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-NEXT: .reg .b64 %fd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [areg_64_double_param_0];
; PTX-NEXT: ld.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1];
; PTX-NEXT: ld.param.b64 %rd2, [areg_64_double_param_1];
; PTX-NEXT: st.global.v4.b64 [%rd2], {%fd1, %fd2, %fd3, %fd4};
; PTX-NEXT: ret;
%load = load <4 x double>, ptr addrspace(1) %in
store <4 x double> %load, ptr addrspace(1) %out
ret void
}

define void @ari_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i8(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i8_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i8_param_1];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <32 x i8>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <32 x i8> %load, ptr addrspace(1) %out.offset
ret void
}

define void @ari_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i16(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i16_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i16_param_1];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <16 x i16>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <16 x i16> %load, ptr addrspace(1) %out.offset
ret void
}

define void @ari_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_half(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_half_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_half_param_1];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <16 x half>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <16 x half> %load, ptr addrspace(1) %out.offset
ret void
}

define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_bfloat(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_bfloat_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_bfloat_param_1];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <16 x bfloat>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <16 x bfloat> %load, ptr addrspace(1) %out.offset
ret void
}

define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i32(
; PTX: {
; PTX-NEXT: .reg .b32 %r<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i32_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i32_param_1];
; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <8 x i32>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <8 x i32> %load, ptr addrspace(1) %out.offset
ret void
}

define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_i64(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<7>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i64_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i64_param_1];
; PTX-NEXT: ld.global.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <4 x i64>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <4 x i64> %load, ptr addrspace(1) %out.offset
ret void
}

define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_float(
; PTX: {
; PTX-NEXT: .reg .b32 %f<9>;
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_float_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_float_param_1];
; PTX-NEXT: ld.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1+32];
; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <8 x float>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <8 x float> %load, ptr addrspace(1) %out.offset
ret void
}

define void @ari_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; PTX-LABEL: ari_64_double(
; PTX: {
; PTX-NEXT: .reg .b64 %rd<3>;
; PTX-NEXT: .reg .b64 %fd<5>;
; PTX-EMPTY:
; PTX-NEXT: // %bb.0:
; PTX-NEXT: ld.param.b64 %rd1, [ari_64_double_param_0];
; PTX-NEXT: ld.param.b64 %rd2, [ari_64_double_param_1];
; PTX-NEXT: ld.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1+32];
; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%fd1, %fd2, %fd3, %fd4};
; PTX-NEXT: ret;
%in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
%load = load <4 x double>, ptr addrspace(1) %in.offset
%out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32
store <4 x double> %load, ptr addrspace(1) %out.offset
ret void
}
llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll (new file, 1442 lines)
File diff suppressed because it is too large.

@@ -0,0 +1,728 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -mcpu=sm_90 -mattr=+ptx87 -S < %s | FileCheck %s -check-prefixes=CHECK,SM90
; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -mcpu=sm_100 -mattr=+ptx88 -S < %s | FileCheck %s -check-prefixes=CHECK,SM100

; 256-bit loads/stores are currently supported only for:
; - the global address space
; - Blackwell (sm_100)
; - PTX 8.8
; - 32/64-bit types

; Currently, the LSV produces 256-bit loads/stores if the first three conditions
; are satisfied, as the backend will either upsize or split vectors of smaller
; elements in the Type Legalization stage. A minimal before/after sketch of this
; rewrite follows this comment block.

; In this file, we test i8, i16, i32, i64, f32, f64.
; The other floating-point types are omitted for simplicity.
; We also test the negative case for non-global i32.

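; A minimal before/after sketch of the rewrite (hand-written illustration, not
; an autogenerated check). Given contiguous, 32-byte-aligned global i32
; accesses such as:
;
;   %v0 = load i32, ptr addrspace(1) %p, align 32
;   %p1 = getelementptr i32, ptr addrspace(1) %p, i64 1
;   %v1 = load i32, ptr addrspace(1) %p1
;   ... (eight consecutive i32 loads in total)
;
; on sm_100/ptx8.8 the vectorizer emits a single 256-bit access:
;
;   %v = load <8 x i32>, ptr addrspace(1) %p, align 32
;
; while on sm_90 it stops at the 128-bit width (<4 x i32>), as the SM90 checks
; below show for the equivalent i8 case (<16 x i8> rather than <32 x i8>).
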
define void @int8x32(ptr addrspace(1) %ptr) {
|
||||
; SM90-LABEL: define void @int8x32(
|
||||
; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
|
||||
; SM90-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr addrspace(1) [[PTR]], i64 0
|
||||
; SM90-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[PTR]], i64 16
|
||||
; SM90-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr addrspace(1) [[PTR0]], align 32
|
||||
; SM90-NEXT: [[L01:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
|
||||
; SM90-NEXT: [[L110:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
|
||||
; SM90-NEXT: [[L211:%.*]] = extractelement <16 x i8> [[TMP1]], i32 2
|
||||
; SM90-NEXT: [[L312:%.*]] = extractelement <16 x i8> [[TMP1]], i32 3
|
||||
; SM90-NEXT: [[L413:%.*]] = extractelement <16 x i8> [[TMP1]], i32 4
|
||||
; SM90-NEXT: [[L514:%.*]] = extractelement <16 x i8> [[TMP1]], i32 5
|
||||
; SM90-NEXT: [[L615:%.*]] = extractelement <16 x i8> [[TMP1]], i32 6
|
||||
; SM90-NEXT: [[L716:%.*]] = extractelement <16 x i8> [[TMP1]], i32 7
|
||||
; SM90-NEXT: [[L817:%.*]] = extractelement <16 x i8> [[TMP1]], i32 8
|
||||
; SM90-NEXT: [[L918:%.*]] = extractelement <16 x i8> [[TMP1]], i32 9
|
||||
; SM90-NEXT: [[LA19:%.*]] = extractelement <16 x i8> [[TMP1]], i32 10
|
||||
; SM90-NEXT: [[LB20:%.*]] = extractelement <16 x i8> [[TMP1]], i32 11
|
||||
; SM90-NEXT: [[LC21:%.*]] = extractelement <16 x i8> [[TMP1]], i32 12
|
||||
; SM90-NEXT: [[LD22:%.*]] = extractelement <16 x i8> [[TMP1]], i32 13
|
||||
; SM90-NEXT: [[LE23:%.*]] = extractelement <16 x i8> [[TMP1]], i32 14
|
||||
; SM90-NEXT: [[LF24:%.*]] = extractelement <16 x i8> [[TMP1]], i32 15
|
||||
; SM90-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr addrspace(1) [[PTR10]], align 16
|
||||
; SM90-NEXT: [[L1025:%.*]] = extractelement <16 x i8> [[TMP2]], i32 0
|
||||
; SM90-NEXT: [[L1126:%.*]] = extractelement <16 x i8> [[TMP2]], i32 1
|
||||
; SM90-NEXT: [[L1227:%.*]] = extractelement <16 x i8> [[TMP2]], i32 2
|
||||
; SM90-NEXT: [[L1328:%.*]] = extractelement <16 x i8> [[TMP2]], i32 3
|
||||
; SM90-NEXT: [[L1429:%.*]] = extractelement <16 x i8> [[TMP2]], i32 4
|
||||
; SM90-NEXT: [[L1530:%.*]] = extractelement <16 x i8> [[TMP2]], i32 5
|
||||
; SM90-NEXT: [[L1631:%.*]] = extractelement <16 x i8> [[TMP2]], i32 6
|
||||
; SM90-NEXT: [[L1732:%.*]] = extractelement <16 x i8> [[TMP2]], i32 7
|
||||
; SM90-NEXT: [[L1833:%.*]] = extractelement <16 x i8> [[TMP2]], i32 8
|
||||
; SM90-NEXT: [[L1934:%.*]] = extractelement <16 x i8> [[TMP2]], i32 9
|
||||
; SM90-NEXT: [[L1A35:%.*]] = extractelement <16 x i8> [[TMP2]], i32 10
|
||||
; SM90-NEXT: [[L1B36:%.*]] = extractelement <16 x i8> [[TMP2]], i32 11
|
||||
; SM90-NEXT: [[L1C37:%.*]] = extractelement <16 x i8> [[TMP2]], i32 12
|
||||
; SM90-NEXT: [[L1D38:%.*]] = extractelement <16 x i8> [[TMP2]], i32 13
|
||||
; SM90-NEXT: [[L1E39:%.*]] = extractelement <16 x i8> [[TMP2]], i32 14
|
||||
; SM90-NEXT: [[L1F40:%.*]] = extractelement <16 x i8> [[TMP2]], i32 15
|
||||
; SM90-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[LB20]], i32 0
|
||||
; SM90-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[LA19]], i32 1
|
||||
; SM90-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L918]], i32 2
|
||||
; SM90-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L817]], i32 3
|
||||
; SM90-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[L716]], i32 4
|
||||
; SM90-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[L615]], i32 5
|
||||
; SM90-NEXT: [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[L514]], i32 6
|
||||
; SM90-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[L413]], i32 7
|
||||
; SM90-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[L312]], i32 8
|
||||
; SM90-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[L211]], i32 9
|
||||
; SM90-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[L110]], i32 10
|
||||
; SM90-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[L01]], i32 11
|
||||
; SM90-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[LF24]], i32 12
|
||||
; SM90-NEXT: [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[LE23]], i32 13
|
||||
; SM90-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> [[TMP16]], i8 [[LD22]], i32 14
|
||||
; SM90-NEXT: [[TMP18:%.*]] = insertelement <16 x i8> [[TMP17]], i8 [[LC21]], i32 15
|
||||
; SM90-NEXT: store <16 x i8> [[TMP18]], ptr addrspace(1) [[PTR0]], align 32
|
||||
; SM90-NEXT: [[TMP19:%.*]] = insertelement <16 x i8> poison, i8 [[L1B36]], i32 0
|
||||
; SM90-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> [[TMP19]], i8 [[L1A35]], i32 1
|
||||
; SM90-NEXT: [[TMP21:%.*]] = insertelement <16 x i8> [[TMP20]], i8 [[L1934]], i32 2
|
||||
; SM90-NEXT: [[TMP22:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[L1833]], i32 3
|
||||
; SM90-NEXT: [[TMP23:%.*]] = insertelement <16 x i8> [[TMP22]], i8 [[L1732]], i32 4
|
||||
; SM90-NEXT: [[TMP24:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[L1631]], i32 5
|
||||
; SM90-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP24]], i8 [[L1530]], i32 6
|
||||
; SM90-NEXT: [[TMP26:%.*]] = insertelement <16 x i8> [[TMP25]], i8 [[L1429]], i32 7
|
||||
; SM90-NEXT: [[TMP27:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[L1328]], i32 8
|
||||
; SM90-NEXT: [[TMP28:%.*]] = insertelement <16 x i8> [[TMP27]], i8 [[L1227]], i32 9
|
||||
; SM90-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[L1126]], i32 10
|
||||
; SM90-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[L1025]], i32 11
|
||||
; SM90-NEXT: [[TMP31:%.*]] = insertelement <16 x i8> [[TMP30]], i8 [[L1F40]], i32 12
|
||||
; SM90-NEXT: [[TMP32:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[L1E39]], i32 13
|
||||
; SM90-NEXT: [[TMP33:%.*]] = insertelement <16 x i8> [[TMP32]], i8 [[L1D38]], i32 14
|
||||
; SM90-NEXT: [[TMP34:%.*]] = insertelement <16 x i8> [[TMP33]], i8 [[L1C37]], i32 15
|
||||
; SM90-NEXT: store <16 x i8> [[TMP34]], ptr addrspace(1) [[PTR10]], align 16
|
||||
; SM90-NEXT: ret void
|
||||
;
|
||||
; SM100-LABEL: define void @int8x32(
|
||||
; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
|
||||
; SM100-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr addrspace(1) [[PTR]], i64 0
|
||||
; SM100-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr addrspace(1) [[PTR0]], align 32
|
||||
; SM100-NEXT: [[L01:%.*]] = extractelement <32 x i8> [[TMP1]], i32 0
|
||||
; SM100-NEXT: [[L110:%.*]] = extractelement <32 x i8> [[TMP1]], i32 1
|
||||
; SM100-NEXT: [[L211:%.*]] = extractelement <32 x i8> [[TMP1]], i32 2
|
||||
; SM100-NEXT: [[L312:%.*]] = extractelement <32 x i8> [[TMP1]], i32 3
|
||||
; SM100-NEXT: [[L413:%.*]] = extractelement <32 x i8> [[TMP1]], i32 4
|
||||
; SM100-NEXT: [[L514:%.*]] = extractelement <32 x i8> [[TMP1]], i32 5
|
||||
; SM100-NEXT: [[L615:%.*]] = extractelement <32 x i8> [[TMP1]], i32 6
|
||||
; SM100-NEXT: [[L716:%.*]] = extractelement <32 x i8> [[TMP1]], i32 7
|
||||
; SM100-NEXT: [[L817:%.*]] = extractelement <32 x i8> [[TMP1]], i32 8
|
||||
; SM100-NEXT: [[L918:%.*]] = extractelement <32 x i8> [[TMP1]], i32 9
|
||||
; SM100-NEXT: [[LA19:%.*]] = extractelement <32 x i8> [[TMP1]], i32 10
|
||||
; SM100-NEXT: [[LB20:%.*]] = extractelement <32 x i8> [[TMP1]], i32 11
|
||||
; SM100-NEXT: [[LC21:%.*]] = extractelement <32 x i8> [[TMP1]], i32 12
|
||||
; SM100-NEXT: [[LD22:%.*]] = extractelement <32 x i8> [[TMP1]], i32 13
|
||||
; SM100-NEXT: [[LE23:%.*]] = extractelement <32 x i8> [[TMP1]], i32 14
|
||||
; SM100-NEXT: [[LF24:%.*]] = extractelement <32 x i8> [[TMP1]], i32 15
|
||||
; SM100-NEXT: [[L1025:%.*]] = extractelement <32 x i8> [[TMP1]], i32 16
|
||||
; SM100-NEXT: [[L1126:%.*]] = extractelement <32 x i8> [[TMP1]], i32 17
|
||||
; SM100-NEXT: [[L1227:%.*]] = extractelement <32 x i8> [[TMP1]], i32 18
|
||||
; SM100-NEXT: [[L1328:%.*]] = extractelement <32 x i8> [[TMP1]], i32 19
|
||||
; SM100-NEXT: [[L1429:%.*]] = extractelement <32 x i8> [[TMP1]], i32 20
|
||||
; SM100-NEXT: [[L1530:%.*]] = extractelement <32 x i8> [[TMP1]], i32 21
|
||||
; SM100-NEXT: [[L1631:%.*]] = extractelement <32 x i8> [[TMP1]], i32 22
|
||||
; SM100-NEXT: [[L1732:%.*]] = extractelement <32 x i8> [[TMP1]], i32 23
|
||||
; SM100-NEXT: [[L1833:%.*]] = extractelement <32 x i8> [[TMP1]], i32 24
|
||||
; SM100-NEXT: [[L1934:%.*]] = extractelement <32 x i8> [[TMP1]], i32 25
|
||||
; SM100-NEXT: [[L1A35:%.*]] = extractelement <32 x i8> [[TMP1]], i32 26
|
||||
; SM100-NEXT: [[L1B36:%.*]] = extractelement <32 x i8> [[TMP1]], i32 27
|
||||
; SM100-NEXT: [[L1C37:%.*]] = extractelement <32 x i8> [[TMP1]], i32 28
|
||||
; SM100-NEXT: [[L1D38:%.*]] = extractelement <32 x i8> [[TMP1]], i32 29
|
||||
; SM100-NEXT: [[L1E39:%.*]] = extractelement <32 x i8> [[TMP1]], i32 30
|
||||
; SM100-NEXT: [[L1F40:%.*]] = extractelement <32 x i8> [[TMP1]], i32 31
|
||||
; SM100-NEXT: [[TMP2:%.*]] = insertelement <32 x i8> poison, i8 [[LB20]], i32 0
|
||||
; SM100-NEXT: [[TMP3:%.*]] = insertelement <32 x i8> [[TMP2]], i8 [[LA19]], i32 1
|
||||
; SM100-NEXT: [[TMP4:%.*]] = insertelement <32 x i8> [[TMP3]], i8 [[L918]], i32 2
|
||||
; SM100-NEXT: [[TMP5:%.*]] = insertelement <32 x i8> [[TMP4]], i8 [[L817]], i32 3
|
||||
; SM100-NEXT: [[TMP6:%.*]] = insertelement <32 x i8> [[TMP5]], i8 [[L716]], i32 4
|
||||
; SM100-NEXT: [[TMP7:%.*]] = insertelement <32 x i8> [[TMP6]], i8 [[L615]], i32 5
|
||||
; SM100-NEXT: [[TMP8:%.*]] = insertelement <32 x i8> [[TMP7]], i8 [[L514]], i32 6
|
||||
; SM100-NEXT: [[TMP9:%.*]] = insertelement <32 x i8> [[TMP8]], i8 [[L413]], i32 7
|
||||
; SM100-NEXT: [[TMP10:%.*]] = insertelement <32 x i8> [[TMP9]], i8 [[L312]], i32 8
|
||||
; SM100-NEXT: [[TMP11:%.*]] = insertelement <32 x i8> [[TMP10]], i8 [[L211]], i32 9
|
||||
; SM100-NEXT: [[TMP12:%.*]] = insertelement <32 x i8> [[TMP11]], i8 [[L110]], i32 10
|
||||
; SM100-NEXT: [[TMP13:%.*]] = insertelement <32 x i8> [[TMP12]], i8 [[L01]], i32 11
|
||||
; SM100-NEXT: [[TMP14:%.*]] = insertelement <32 x i8> [[TMP13]], i8 [[LF24]], i32 12
|
||||
; SM100-NEXT: [[TMP15:%.*]] = insertelement <32 x i8> [[TMP14]], i8 [[LE23]], i32 13
|
||||
; SM100-NEXT: [[TMP16:%.*]] = insertelement <32 x i8> [[TMP15]], i8 [[LD22]], i32 14
|
||||
; SM100-NEXT: [[TMP17:%.*]] = insertelement <32 x i8> [[TMP16]], i8 [[LC21]], i32 15
; SM100-NEXT: [[TMP18:%.*]] = insertelement <32 x i8> [[TMP17]], i8 [[L1B36]], i32 16
; SM100-NEXT: [[TMP19:%.*]] = insertelement <32 x i8> [[TMP18]], i8 [[L1A35]], i32 17
; SM100-NEXT: [[TMP20:%.*]] = insertelement <32 x i8> [[TMP19]], i8 [[L1934]], i32 18
; SM100-NEXT: [[TMP21:%.*]] = insertelement <32 x i8> [[TMP20]], i8 [[L1833]], i32 19
; SM100-NEXT: [[TMP22:%.*]] = insertelement <32 x i8> [[TMP21]], i8 [[L1732]], i32 20
; SM100-NEXT: [[TMP23:%.*]] = insertelement <32 x i8> [[TMP22]], i8 [[L1631]], i32 21
; SM100-NEXT: [[TMP24:%.*]] = insertelement <32 x i8> [[TMP23]], i8 [[L1530]], i32 22
; SM100-NEXT: [[TMP25:%.*]] = insertelement <32 x i8> [[TMP24]], i8 [[L1429]], i32 23
; SM100-NEXT: [[TMP26:%.*]] = insertelement <32 x i8> [[TMP25]], i8 [[L1328]], i32 24
; SM100-NEXT: [[TMP27:%.*]] = insertelement <32 x i8> [[TMP26]], i8 [[L1227]], i32 25
; SM100-NEXT: [[TMP28:%.*]] = insertelement <32 x i8> [[TMP27]], i8 [[L1126]], i32 26
; SM100-NEXT: [[TMP29:%.*]] = insertelement <32 x i8> [[TMP28]], i8 [[L1025]], i32 27
; SM100-NEXT: [[TMP30:%.*]] = insertelement <32 x i8> [[TMP29]], i8 [[L1F40]], i32 28
; SM100-NEXT: [[TMP31:%.*]] = insertelement <32 x i8> [[TMP30]], i8 [[L1E39]], i32 29
; SM100-NEXT: [[TMP32:%.*]] = insertelement <32 x i8> [[TMP31]], i8 [[L1D38]], i32 30
; SM100-NEXT: [[TMP33:%.*]] = insertelement <32 x i8> [[TMP32]], i8 [[L1C37]], i32 31
; SM100-NEXT: store <32 x i8> [[TMP33]], ptr addrspace(1) [[PTR0]], align 32
; SM100-NEXT: ret void
;

  %ptr0 = getelementptr i8, ptr addrspace(1) %ptr, i64 0
  %ptr1 = getelementptr i8, ptr addrspace(1) %ptr, i64 1
  %ptr2 = getelementptr i8, ptr addrspace(1) %ptr, i64 2
  %ptr3 = getelementptr i8, ptr addrspace(1) %ptr, i64 3
  %ptr4 = getelementptr i8, ptr addrspace(1) %ptr, i64 4
  %ptr5 = getelementptr i8, ptr addrspace(1) %ptr, i64 5
  %ptr6 = getelementptr i8, ptr addrspace(1) %ptr, i64 6
  %ptr7 = getelementptr i8, ptr addrspace(1) %ptr, i64 7
  %ptr8 = getelementptr i8, ptr addrspace(1) %ptr, i64 8
  %ptr9 = getelementptr i8, ptr addrspace(1) %ptr, i64 9
  %ptra = getelementptr i8, ptr addrspace(1) %ptr, i64 10
  %ptrb = getelementptr i8, ptr addrspace(1) %ptr, i64 11
  %ptrc = getelementptr i8, ptr addrspace(1) %ptr, i64 12
  %ptrd = getelementptr i8, ptr addrspace(1) %ptr, i64 13
  %ptre = getelementptr i8, ptr addrspace(1) %ptr, i64 14
  %ptrf = getelementptr i8, ptr addrspace(1) %ptr, i64 15
  %ptr10 = getelementptr i8, ptr addrspace(1) %ptr, i64 16
  %ptr11 = getelementptr i8, ptr addrspace(1) %ptr, i64 17
  %ptr12 = getelementptr i8, ptr addrspace(1) %ptr, i64 18
  %ptr13 = getelementptr i8, ptr addrspace(1) %ptr, i64 19
  %ptr14 = getelementptr i8, ptr addrspace(1) %ptr, i64 20
  %ptr15 = getelementptr i8, ptr addrspace(1) %ptr, i64 21
  %ptr16 = getelementptr i8, ptr addrspace(1) %ptr, i64 22
  %ptr17 = getelementptr i8, ptr addrspace(1) %ptr, i64 23
  %ptr18 = getelementptr i8, ptr addrspace(1) %ptr, i64 24
  %ptr19 = getelementptr i8, ptr addrspace(1) %ptr, i64 25
  %ptr1a = getelementptr i8, ptr addrspace(1) %ptr, i64 26
  %ptr1b = getelementptr i8, ptr addrspace(1) %ptr, i64 27
  %ptr1c = getelementptr i8, ptr addrspace(1) %ptr, i64 28
  %ptr1d = getelementptr i8, ptr addrspace(1) %ptr, i64 29
  %ptr1e = getelementptr i8, ptr addrspace(1) %ptr, i64 30
  %ptr1f = getelementptr i8, ptr addrspace(1) %ptr, i64 31

  %l0 = load i8, ptr addrspace(1) %ptr0, align 32
  %l1 = load i8, ptr addrspace(1) %ptr1
  %l2 = load i8, ptr addrspace(1) %ptr2
  %l3 = load i8, ptr addrspace(1) %ptr3
  %l4 = load i8, ptr addrspace(1) %ptr4
  %l5 = load i8, ptr addrspace(1) %ptr5
  %l6 = load i8, ptr addrspace(1) %ptr6
  %l7 = load i8, ptr addrspace(1) %ptr7
  %l8 = load i8, ptr addrspace(1) %ptr8
  %l9 = load i8, ptr addrspace(1) %ptr9
  %la = load i8, ptr addrspace(1) %ptra
  %lb = load i8, ptr addrspace(1) %ptrb
  %lc = load i8, ptr addrspace(1) %ptrc
  %ld = load i8, ptr addrspace(1) %ptrd
  %le = load i8, ptr addrspace(1) %ptre
  %lf = load i8, ptr addrspace(1) %ptrf
  %l10 = load i8, ptr addrspace(1) %ptr10, align 16
  %l11 = load i8, ptr addrspace(1) %ptr11
  %l12 = load i8, ptr addrspace(1) %ptr12
  %l13 = load i8, ptr addrspace(1) %ptr13
  %l14 = load i8, ptr addrspace(1) %ptr14
  %l15 = load i8, ptr addrspace(1) %ptr15
  %l16 = load i8, ptr addrspace(1) %ptr16
  %l17 = load i8, ptr addrspace(1) %ptr17
  %l18 = load i8, ptr addrspace(1) %ptr18
  %l19 = load i8, ptr addrspace(1) %ptr19
  %l1a = load i8, ptr addrspace(1) %ptr1a
  %l1b = load i8, ptr addrspace(1) %ptr1b
  %l1c = load i8, ptr addrspace(1) %ptr1c
  %l1d = load i8, ptr addrspace(1) %ptr1d
  %l1e = load i8, ptr addrspace(1) %ptr1e
  %l1f = load i8, ptr addrspace(1) %ptr1f

  store i8 %lf, ptr addrspace(1) %ptrc
  store i8 %le, ptr addrspace(1) %ptrd
  store i8 %ld, ptr addrspace(1) %ptre
  store i8 %lc, ptr addrspace(1) %ptrf
  store i8 %lb, ptr addrspace(1) %ptr0, align 32
  store i8 %la, ptr addrspace(1) %ptr1
  store i8 %l9, ptr addrspace(1) %ptr2
  store i8 %l8, ptr addrspace(1) %ptr3
  store i8 %l7, ptr addrspace(1) %ptr4
  store i8 %l6, ptr addrspace(1) %ptr5
  store i8 %l5, ptr addrspace(1) %ptr6
  store i8 %l4, ptr addrspace(1) %ptr7
  store i8 %l3, ptr addrspace(1) %ptr8
  store i8 %l2, ptr addrspace(1) %ptr9
  store i8 %l1, ptr addrspace(1) %ptra
  store i8 %l0, ptr addrspace(1) %ptrb
  store i8 %l1f, ptr addrspace(1) %ptr1c
  store i8 %l1e, ptr addrspace(1) %ptr1d
  store i8 %l1d, ptr addrspace(1) %ptr1e
  store i8 %l1c, ptr addrspace(1) %ptr1f
  store i8 %l1b, ptr addrspace(1) %ptr10, align 16
  store i8 %l1a, ptr addrspace(1) %ptr11
  store i8 %l19, ptr addrspace(1) %ptr12
  store i8 %l18, ptr addrspace(1) %ptr13
  store i8 %l17, ptr addrspace(1) %ptr14
  store i8 %l16, ptr addrspace(1) %ptr15
  store i8 %l15, ptr addrspace(1) %ptr16
  store i8 %l14, ptr addrspace(1) %ptr17
  store i8 %l13, ptr addrspace(1) %ptr18
  store i8 %l12, ptr addrspace(1) %ptr19
  store i8 %l11, ptr addrspace(1) %ptr1a
  store i8 %l10, ptr addrspace(1) %ptr1b

  ret void
}
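; Sixteen contiguous i16 accesses (256 bits total): on SM100 the vectorizer
; emits a single <16 x i16> load/store, while SM90 splits the region into two
; 128-bit <8 x i16> halves.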
define void @int16x16(ptr addrspace(1) %ptr) {
; SM90-LABEL: define void @int16x16(
; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
; SM90-NEXT: [[PTR0:%.*]] = getelementptr i16, ptr addrspace(1) [[PTR]], i64 0
; SM90-NEXT: [[PTR8:%.*]] = getelementptr i16, ptr addrspace(1) [[PTR]], i64 8
; SM90-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr addrspace(1) [[PTR0]], align 32
; SM90-NEXT: [[L01:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0
; SM90-NEXT: [[L12:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1
; SM90-NEXT: [[L23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2
; SM90-NEXT: [[L34:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3
; SM90-NEXT: [[L45:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4
; SM90-NEXT: [[L56:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5
; SM90-NEXT: [[L67:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6
; SM90-NEXT: [[L78:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7
; SM90-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr addrspace(1) [[PTR8]], align 16
; SM90-NEXT: [[L89:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0
; SM90-NEXT: [[L910:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1
; SM90-NEXT: [[LA11:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2
; SM90-NEXT: [[LB12:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3
; SM90-NEXT: [[LC13:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4
; SM90-NEXT: [[LD14:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5
; SM90-NEXT: [[LE15:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6
; SM90-NEXT: [[LF16:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7
; SM90-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[LB12]], i32 0
; SM90-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[LA11]], i32 1
; SM90-NEXT: [[TMP5:%.*]] = insertelement <8 x i16> [[TMP4]], i16 [[L910]], i32 2
; SM90-NEXT: [[TMP6:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[L89]], i32 3
; SM90-NEXT: [[TMP7:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[L78]], i32 4
; SM90-NEXT: [[TMP8:%.*]] = insertelement <8 x i16> [[TMP7]], i16 [[L67]], i32 5
; SM90-NEXT: [[TMP9:%.*]] = insertelement <8 x i16> [[TMP8]], i16 [[L56]], i32 6
; SM90-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> [[TMP9]], i16 [[L45]], i32 7
; SM90-NEXT: store <8 x i16> [[TMP10]], ptr addrspace(1) [[PTR0]], align 32
; SM90-NEXT: [[TMP11:%.*]] = insertelement <8 x i16> poison, i16 [[L34]], i32 0
; SM90-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP11]], i16 [[L23]], i32 1
; SM90-NEXT: [[TMP13:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[L12]], i32 2
; SM90-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP13]], i16 [[L01]], i32 3
; SM90-NEXT: [[TMP15:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[LF16]], i32 4
; SM90-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP15]], i16 [[LE15]], i32 5
; SM90-NEXT: [[TMP17:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[LD14]], i32 6
; SM90-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP17]], i16 [[LC13]], i32 7
; SM90-NEXT: store <8 x i16> [[TMP18]], ptr addrspace(1) [[PTR8]], align 16
; SM90-NEXT: ret void
;
; SM100-LABEL: define void @int16x16(
; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
; SM100-NEXT: [[PTR0:%.*]] = getelementptr i16, ptr addrspace(1) [[PTR]], i64 0
; SM100-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr addrspace(1) [[PTR0]], align 32
; SM100-NEXT: [[L01:%.*]] = extractelement <16 x i16> [[TMP1]], i32 0
; SM100-NEXT: [[L12:%.*]] = extractelement <16 x i16> [[TMP1]], i32 1
; SM100-NEXT: [[L23:%.*]] = extractelement <16 x i16> [[TMP1]], i32 2
; SM100-NEXT: [[L34:%.*]] = extractelement <16 x i16> [[TMP1]], i32 3
; SM100-NEXT: [[L45:%.*]] = extractelement <16 x i16> [[TMP1]], i32 4
; SM100-NEXT: [[L56:%.*]] = extractelement <16 x i16> [[TMP1]], i32 5
; SM100-NEXT: [[L67:%.*]] = extractelement <16 x i16> [[TMP1]], i32 6
; SM100-NEXT: [[L78:%.*]] = extractelement <16 x i16> [[TMP1]], i32 7
; SM100-NEXT: [[L89:%.*]] = extractelement <16 x i16> [[TMP1]], i32 8
; SM100-NEXT: [[L910:%.*]] = extractelement <16 x i16> [[TMP1]], i32 9
; SM100-NEXT: [[LA11:%.*]] = extractelement <16 x i16> [[TMP1]], i32 10
; SM100-NEXT: [[LB12:%.*]] = extractelement <16 x i16> [[TMP1]], i32 11
; SM100-NEXT: [[LC13:%.*]] = extractelement <16 x i16> [[TMP1]], i32 12
; SM100-NEXT: [[LD14:%.*]] = extractelement <16 x i16> [[TMP1]], i32 13
; SM100-NEXT: [[LE15:%.*]] = extractelement <16 x i16> [[TMP1]], i32 14
; SM100-NEXT: [[LF16:%.*]] = extractelement <16 x i16> [[TMP1]], i32 15
; SM100-NEXT: [[TMP2:%.*]] = insertelement <16 x i16> poison, i16 [[LB12]], i32 0
; SM100-NEXT: [[TMP3:%.*]] = insertelement <16 x i16> [[TMP2]], i16 [[LA11]], i32 1
; SM100-NEXT: [[TMP4:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[L910]], i32 2
; SM100-NEXT: [[TMP5:%.*]] = insertelement <16 x i16> [[TMP4]], i16 [[L89]], i32 3
; SM100-NEXT: [[TMP6:%.*]] = insertelement <16 x i16> [[TMP5]], i16 [[L78]], i32 4
; SM100-NEXT: [[TMP7:%.*]] = insertelement <16 x i16> [[TMP6]], i16 [[L67]], i32 5
; SM100-NEXT: [[TMP8:%.*]] = insertelement <16 x i16> [[TMP7]], i16 [[L56]], i32 6
; SM100-NEXT: [[TMP9:%.*]] = insertelement <16 x i16> [[TMP8]], i16 [[L45]], i32 7
; SM100-NEXT: [[TMP10:%.*]] = insertelement <16 x i16> [[TMP9]], i16 [[L34]], i32 8
; SM100-NEXT: [[TMP11:%.*]] = insertelement <16 x i16> [[TMP10]], i16 [[L23]], i32 9
; SM100-NEXT: [[TMP12:%.*]] = insertelement <16 x i16> [[TMP11]], i16 [[L12]], i32 10
; SM100-NEXT: [[TMP13:%.*]] = insertelement <16 x i16> [[TMP12]], i16 [[L01]], i32 11
; SM100-NEXT: [[TMP14:%.*]] = insertelement <16 x i16> [[TMP13]], i16 [[LF16]], i32 12
; SM100-NEXT: [[TMP15:%.*]] = insertelement <16 x i16> [[TMP14]], i16 [[LE15]], i32 13
; SM100-NEXT: [[TMP16:%.*]] = insertelement <16 x i16> [[TMP15]], i16 [[LD14]], i32 14
; SM100-NEXT: [[TMP17:%.*]] = insertelement <16 x i16> [[TMP16]], i16 [[LC13]], i32 15
; SM100-NEXT: store <16 x i16> [[TMP17]], ptr addrspace(1) [[PTR0]], align 32
; SM100-NEXT: ret void
;

  %ptr0 = getelementptr i16, ptr addrspace(1) %ptr, i64 0
  %ptr1 = getelementptr i16, ptr addrspace(1) %ptr, i64 1
  %ptr2 = getelementptr i16, ptr addrspace(1) %ptr, i64 2
  %ptr3 = getelementptr i16, ptr addrspace(1) %ptr, i64 3
  %ptr4 = getelementptr i16, ptr addrspace(1) %ptr, i64 4
  %ptr5 = getelementptr i16, ptr addrspace(1) %ptr, i64 5
  %ptr6 = getelementptr i16, ptr addrspace(1) %ptr, i64 6
  %ptr7 = getelementptr i16, ptr addrspace(1) %ptr, i64 7
  %ptr8 = getelementptr i16, ptr addrspace(1) %ptr, i64 8
  %ptr9 = getelementptr i16, ptr addrspace(1) %ptr, i64 9
  %ptra = getelementptr i16, ptr addrspace(1) %ptr, i64 10
  %ptrb = getelementptr i16, ptr addrspace(1) %ptr, i64 11
  %ptrc = getelementptr i16, ptr addrspace(1) %ptr, i64 12
  %ptrd = getelementptr i16, ptr addrspace(1) %ptr, i64 13
  %ptre = getelementptr i16, ptr addrspace(1) %ptr, i64 14
  %ptrf = getelementptr i16, ptr addrspace(1) %ptr, i64 15

  %l0 = load i16, ptr addrspace(1) %ptr0, align 32
  %l1 = load i16, ptr addrspace(1) %ptr1
  %l2 = load i16, ptr addrspace(1) %ptr2
  %l3 = load i16, ptr addrspace(1) %ptr3
  %l4 = load i16, ptr addrspace(1) %ptr4
  %l5 = load i16, ptr addrspace(1) %ptr5
  %l6 = load i16, ptr addrspace(1) %ptr6
  %l7 = load i16, ptr addrspace(1) %ptr7
  %l8 = load i16, ptr addrspace(1) %ptr8, align 16
  %l9 = load i16, ptr addrspace(1) %ptr9
  %la = load i16, ptr addrspace(1) %ptra
  %lb = load i16, ptr addrspace(1) %ptrb
  %lc = load i16, ptr addrspace(1) %ptrc
  %ld = load i16, ptr addrspace(1) %ptrd
  %le = load i16, ptr addrspace(1) %ptre
  %lf = load i16, ptr addrspace(1) %ptrf

  store i16 %lf, ptr addrspace(1) %ptrc
  store i16 %le, ptr addrspace(1) %ptrd
  store i16 %ld, ptr addrspace(1) %ptre
  store i16 %lc, ptr addrspace(1) %ptrf
  store i16 %lb, ptr addrspace(1) %ptr0, align 32
  store i16 %la, ptr addrspace(1) %ptr1
  store i16 %l9, ptr addrspace(1) %ptr2
  store i16 %l8, ptr addrspace(1) %ptr3
  store i16 %l7, ptr addrspace(1) %ptr4
  store i16 %l6, ptr addrspace(1) %ptr5
  store i16 %l5, ptr addrspace(1) %ptr6
  store i16 %l4, ptr addrspace(1) %ptr7
  store i16 %l3, ptr addrspace(1) %ptr8, align 16
  store i16 %l2, ptr addrspace(1) %ptr9
  store i16 %l1, ptr addrspace(1) %ptra
  store i16 %l0, ptr addrspace(1) %ptrb

  ret void
}
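; Eight contiguous i32 accesses (256 bits): one <8 x i32> load/store on SM100,
; two 128-bit <4 x i32> accesses on SM90.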
define void @int32x8(ptr addrspace(1) %ptr) {
; SM90-LABEL: define void @int32x8(
; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
; SM90-NEXT: [[PTR0:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 0
; SM90-NEXT: [[PTR4:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
; SM90-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[PTR0]], align 32
; SM90-NEXT: [[L01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; SM90-NEXT: [[L12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; SM90-NEXT: [[L23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; SM90-NEXT: [[L34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; SM90-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[PTR4]], align 16
; SM90-NEXT: [[L45:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
; SM90-NEXT: [[L56:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
; SM90-NEXT: [[L67:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
; SM90-NEXT: [[L78:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
; SM90-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[L78]], i32 0
; SM90-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[L67]], i32 1
; SM90-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[L56]], i32 2
; SM90-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[L45]], i32 3
; SM90-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[PTR0]], align 32
; SM90-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[L34]], i32 0
; SM90-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[L23]], i32 1
; SM90-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[L12]], i32 2
; SM90-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[L01]], i32 3
; SM90-NEXT: store <4 x i32> [[TMP10]], ptr addrspace(1) [[PTR4]], align 16
; SM90-NEXT: ret void
;
; SM100-LABEL: define void @int32x8(
; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
; SM100-NEXT: [[PTR0:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 0
; SM100-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[PTR0]], align 32
; SM100-NEXT: [[L01:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0
; SM100-NEXT: [[L12:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1
; SM100-NEXT: [[L23:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2
; SM100-NEXT: [[L34:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3
; SM100-NEXT: [[L45:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4
; SM100-NEXT: [[L56:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5
; SM100-NEXT: [[L67:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6
; SM100-NEXT: [[L78:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7
; SM100-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[L78]], i32 0
; SM100-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[L67]], i32 1
; SM100-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[L56]], i32 2
; SM100-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[L45]], i32 3
; SM100-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[L34]], i32 4
; SM100-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[L23]], i32 5
; SM100-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[L12]], i32 6
; SM100-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[L01]], i32 7
; SM100-NEXT: store <8 x i32> [[TMP9]], ptr addrspace(1) [[PTR0]], align 32
; SM100-NEXT: ret void
;

  %ptr0 = getelementptr i32, ptr addrspace(1) %ptr, i64 0
  %ptr1 = getelementptr i32, ptr addrspace(1) %ptr, i64 1
  %ptr2 = getelementptr i32, ptr addrspace(1) %ptr, i64 2
  %ptr3 = getelementptr i32, ptr addrspace(1) %ptr, i64 3
  %ptr4 = getelementptr i32, ptr addrspace(1) %ptr, i64 4
  %ptr5 = getelementptr i32, ptr addrspace(1) %ptr, i64 5
  %ptr6 = getelementptr i32, ptr addrspace(1) %ptr, i64 6
  %ptr7 = getelementptr i32, ptr addrspace(1) %ptr, i64 7

  %l0 = load i32, ptr addrspace(1) %ptr0, align 32
  %l1 = load i32, ptr addrspace(1) %ptr1
  %l2 = load i32, ptr addrspace(1) %ptr2
  %l3 = load i32, ptr addrspace(1) %ptr3
  %l4 = load i32, ptr addrspace(1) %ptr4, align 16
  %l5 = load i32, ptr addrspace(1) %ptr5
  %l6 = load i32, ptr addrspace(1) %ptr6
  %l7 = load i32, ptr addrspace(1) %ptr7

  store i32 %l7, ptr addrspace(1) %ptr0, align 32
  store i32 %l6, ptr addrspace(1) %ptr1
  store i32 %l5, ptr addrspace(1) %ptr2
  store i32 %l4, ptr addrspace(1) %ptr3
  store i32 %l3, ptr addrspace(1) %ptr4, align 16
  store i32 %l2, ptr addrspace(1) %ptr5
  store i32 %l1, ptr addrspace(1) %ptr6
  store i32 %l0, ptr addrspace(1) %ptr7

  ret void
}
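; Four contiguous i64 accesses (256 bits): one <4 x i64> load/store on SM100,
; two 128-bit <2 x i64> accesses on SM90.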
define void @int64x4(ptr addrspace(1) %ptr) {
; SM90-LABEL: define void @int64x4(
; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
; SM90-NEXT: [[PTR0:%.*]] = getelementptr i64, ptr addrspace(1) [[PTR]], i64 0
; SM90-NEXT: [[PTR2:%.*]] = getelementptr i64, ptr addrspace(1) [[PTR]], i64 2
; SM90-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr addrspace(1) [[PTR0]], align 32
; SM90-NEXT: [[L01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
; SM90-NEXT: [[L12:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
; SM90-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr addrspace(1) [[PTR2]], align 16
; SM90-NEXT: [[L23:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
; SM90-NEXT: [[L34:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
; SM90-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[L34]], i32 0
; SM90-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[L23]], i32 1
; SM90-NEXT: store <2 x i64> [[TMP4]], ptr addrspace(1) [[PTR0]], align 32
; SM90-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[L12]], i32 0
; SM90-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[L01]], i32 1
; SM90-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[PTR2]], align 16
; SM90-NEXT: ret void
;
; SM100-LABEL: define void @int64x4(
; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
; SM100-NEXT: [[PTR0:%.*]] = getelementptr i64, ptr addrspace(1) [[PTR]], i64 0
; SM100-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(1) [[PTR0]], align 32
; SM100-NEXT: [[L01:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0
; SM100-NEXT: [[L12:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1
; SM100-NEXT: [[L23:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2
; SM100-NEXT: [[L34:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3
; SM100-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> poison, i64 [[L34]], i32 0
; SM100-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[L23]], i32 1
; SM100-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[L12]], i32 2
; SM100-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> [[TMP4]], i64 [[L01]], i32 3
; SM100-NEXT: store <4 x i64> [[TMP5]], ptr addrspace(1) [[PTR0]], align 32
; SM100-NEXT: ret void
;

  %ptr0 = getelementptr i64, ptr addrspace(1) %ptr, i64 0
  %ptr1 = getelementptr i64, ptr addrspace(1) %ptr, i64 1
  %ptr2 = getelementptr i64, ptr addrspace(1) %ptr, i64 2
  %ptr3 = getelementptr i64, ptr addrspace(1) %ptr, i64 3

  %l0 = load i64, ptr addrspace(1) %ptr0, align 32
  %l1 = load i64, ptr addrspace(1) %ptr1
  %l2 = load i64, ptr addrspace(1) %ptr2, align 16
  %l3 = load i64, ptr addrspace(1) %ptr3

  store i64 %l3, ptr addrspace(1) %ptr0, align 32
  store i64 %l2, ptr addrspace(1) %ptr1
  store i64 %l1, ptr addrspace(1) %ptr2, align 16
  store i64 %l0, ptr addrspace(1) %ptr3

  ret void
}
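; Same shape as @int32x8 but with float elements: one <8 x float> access on
; SM100, two 128-bit <4 x float> accesses on SM90.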
define void @float32x8(ptr addrspace(1) %ptr) {
; SM90-LABEL: define void @float32x8(
; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
; SM90-NEXT: [[PTR0:%.*]] = getelementptr float, ptr addrspace(1) [[PTR]], i64 0
; SM90-NEXT: [[PTR4:%.*]] = getelementptr float, ptr addrspace(1) [[PTR]], i64 4
; SM90-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr addrspace(1) [[PTR0]], align 32
; SM90-NEXT: [[L01:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
; SM90-NEXT: [[L12:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
; SM90-NEXT: [[L23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
; SM90-NEXT: [[L34:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
; SM90-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr addrspace(1) [[PTR4]], align 16
; SM90-NEXT: [[L45:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
; SM90-NEXT: [[L56:%.*]] = extractelement <4 x float> [[TMP2]], i32 1
; SM90-NEXT: [[L67:%.*]] = extractelement <4 x float> [[TMP2]], i32 2
; SM90-NEXT: [[L78:%.*]] = extractelement <4 x float> [[TMP2]], i32 3
; SM90-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[L78]], i32 0
; SM90-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[L67]], i32 1
; SM90-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[L56]], i32 2
; SM90-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[L45]], i32 3
; SM90-NEXT: store <4 x float> [[TMP6]], ptr addrspace(1) [[PTR0]], align 32
; SM90-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[L34]], i32 0
; SM90-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[L23]], i32 1
; SM90-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[L12]], i32 2
; SM90-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[L01]], i32 3
; SM90-NEXT: store <4 x float> [[TMP10]], ptr addrspace(1) [[PTR4]], align 16
; SM90-NEXT: ret void
;
; SM100-LABEL: define void @float32x8(
; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
; SM100-NEXT: [[PTR0:%.*]] = getelementptr float, ptr addrspace(1) [[PTR]], i64 0
; SM100-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr addrspace(1) [[PTR0]], align 32
; SM100-NEXT: [[L01:%.*]] = extractelement <8 x float> [[TMP1]], i32 0
; SM100-NEXT: [[L12:%.*]] = extractelement <8 x float> [[TMP1]], i32 1
; SM100-NEXT: [[L23:%.*]] = extractelement <8 x float> [[TMP1]], i32 2
; SM100-NEXT: [[L34:%.*]] = extractelement <8 x float> [[TMP1]], i32 3
; SM100-NEXT: [[L45:%.*]] = extractelement <8 x float> [[TMP1]], i32 4
; SM100-NEXT: [[L56:%.*]] = extractelement <8 x float> [[TMP1]], i32 5
; SM100-NEXT: [[L67:%.*]] = extractelement <8 x float> [[TMP1]], i32 6
; SM100-NEXT: [[L78:%.*]] = extractelement <8 x float> [[TMP1]], i32 7
; SM100-NEXT: [[TMP2:%.*]] = insertelement <8 x float> poison, float [[L78]], i32 0
; SM100-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[L67]], i32 1
; SM100-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[L56]], i32 2
; SM100-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[L45]], i32 3
; SM100-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[L34]], i32 4
; SM100-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[L23]], i32 5
; SM100-NEXT: [[TMP8:%.*]] = insertelement <8 x float> [[TMP7]], float [[L12]], i32 6
; SM100-NEXT: [[TMP9:%.*]] = insertelement <8 x float> [[TMP8]], float [[L01]], i32 7
; SM100-NEXT: store <8 x float> [[TMP9]], ptr addrspace(1) [[PTR0]], align 32
; SM100-NEXT: ret void
;

  %ptr0 = getelementptr float, ptr addrspace(1) %ptr, i64 0
  %ptr1 = getelementptr float, ptr addrspace(1) %ptr, i64 1
  %ptr2 = getelementptr float, ptr addrspace(1) %ptr, i64 2
  %ptr3 = getelementptr float, ptr addrspace(1) %ptr, i64 3
  %ptr4 = getelementptr float, ptr addrspace(1) %ptr, i64 4
  %ptr5 = getelementptr float, ptr addrspace(1) %ptr, i64 5
  %ptr6 = getelementptr float, ptr addrspace(1) %ptr, i64 6
  %ptr7 = getelementptr float, ptr addrspace(1) %ptr, i64 7

  %l0 = load float, ptr addrspace(1) %ptr0, align 32
  %l1 = load float, ptr addrspace(1) %ptr1
  %l2 = load float, ptr addrspace(1) %ptr2
  %l3 = load float, ptr addrspace(1) %ptr3
  %l4 = load float, ptr addrspace(1) %ptr4, align 16
  %l5 = load float, ptr addrspace(1) %ptr5
  %l6 = load float, ptr addrspace(1) %ptr6
  %l7 = load float, ptr addrspace(1) %ptr7

  store float %l7, ptr addrspace(1) %ptr0, align 32
  store float %l6, ptr addrspace(1) %ptr1
  store float %l5, ptr addrspace(1) %ptr2
  store float %l4, ptr addrspace(1) %ptr3
  store float %l3, ptr addrspace(1) %ptr4, align 16
  store float %l2, ptr addrspace(1) %ptr5
  store float %l1, ptr addrspace(1) %ptr6
  store float %l0, ptr addrspace(1) %ptr7

  ret void
}
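; Four contiguous double accesses (256 bits): one <4 x double> load/store on
; SM100, two 128-bit <2 x double> accesses on SM90.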
define void @float64x4(ptr addrspace(1) %ptr) {
; SM90-LABEL: define void @float64x4(
; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
; SM90-NEXT: [[PTR0:%.*]] = getelementptr double, ptr addrspace(1) [[PTR]], i64 0
; SM90-NEXT: [[PTR2:%.*]] = getelementptr double, ptr addrspace(1) [[PTR]], i64 2
; SM90-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr addrspace(1) [[PTR0]], align 32
; SM90-NEXT: [[L01:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
; SM90-NEXT: [[L12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; SM90-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr addrspace(1) [[PTR2]], align 16
; SM90-NEXT: [[L23:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
; SM90-NEXT: [[L34:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
; SM90-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[L34]], i32 0
; SM90-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[L23]], i32 1
; SM90-NEXT: store <2 x double> [[TMP4]], ptr addrspace(1) [[PTR0]], align 32
; SM90-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[L12]], i32 0
; SM90-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[L01]], i32 1
; SM90-NEXT: store <2 x double> [[TMP6]], ptr addrspace(1) [[PTR2]], align 16
; SM90-NEXT: ret void
;
; SM100-LABEL: define void @float64x4(
; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] {
; SM100-NEXT: [[PTR0:%.*]] = getelementptr double, ptr addrspace(1) [[PTR]], i64 0
; SM100-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr addrspace(1) [[PTR0]], align 32
; SM100-NEXT: [[L01:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
; SM100-NEXT: [[L12:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
; SM100-NEXT: [[L23:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
; SM100-NEXT: [[L34:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
; SM100-NEXT: [[TMP2:%.*]] = insertelement <4 x double> poison, double [[L34]], i32 0
; SM100-NEXT: [[TMP3:%.*]] = insertelement <4 x double> [[TMP2]], double [[L23]], i32 1
; SM100-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[L12]], i32 2
; SM100-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[L01]], i32 3
; SM100-NEXT: store <4 x double> [[TMP5]], ptr addrspace(1) [[PTR0]], align 32
; SM100-NEXT: ret void
;

  %ptr0 = getelementptr double, ptr addrspace(1) %ptr, i64 0
  %ptr1 = getelementptr double, ptr addrspace(1) %ptr, i64 1
  %ptr2 = getelementptr double, ptr addrspace(1) %ptr, i64 2
  %ptr3 = getelementptr double, ptr addrspace(1) %ptr, i64 3

  %l0 = load double, ptr addrspace(1) %ptr0, align 32
  %l1 = load double, ptr addrspace(1) %ptr1
  %l2 = load double, ptr addrspace(1) %ptr2, align 16
  %l3 = load double, ptr addrspace(1) %ptr3

  store double %l3, ptr addrspace(1) %ptr0, align 32
  store double %l2, ptr addrspace(1) %ptr1
  store double %l1, ptr addrspace(1) %ptr2, align 16
  store double %l0, ptr addrspace(1) %ptr3

  ret void
}
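; Negative test: the 256-bit widening applies only to the global address space,
; so a generic (non-global) pointer stays at two 128-bit <4 x i32> accesses on
; both targets, hence the shared CHECK prefix.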
define void @int32x8_non_global(ptr %ptr) {
; CHECK-LABEL: define void @int32x8_non_global(
; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i32, ptr [[PTR]], i64 0
; CHECK-NEXT: [[PTR4:%.*]] = getelementptr i32, ptr [[PTR]], i64 4
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR0]], align 32
; CHECK-NEXT: [[L01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
; CHECK-NEXT: [[L12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT: [[L23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
; CHECK-NEXT: [[L34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[PTR4]], align 16
; CHECK-NEXT: [[L45:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0
; CHECK-NEXT: [[L56:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1
; CHECK-NEXT: [[L67:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2
; CHECK-NEXT: [[L78:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[L78]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[L67]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[L56]], i32 2
; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[L45]], i32 3
; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[PTR0]], align 32
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[L34]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[L23]], i32 1
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[L12]], i32 2
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[L01]], i32 3
; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[PTR4]], align 16
; CHECK-NEXT: ret void

  %ptr0 = getelementptr i32, ptr %ptr, i64 0
  %ptr1 = getelementptr i32, ptr %ptr, i64 1
  %ptr2 = getelementptr i32, ptr %ptr, i64 2
  %ptr3 = getelementptr i32, ptr %ptr, i64 3
  %ptr4 = getelementptr i32, ptr %ptr, i64 4
  %ptr5 = getelementptr i32, ptr %ptr, i64 5
  %ptr6 = getelementptr i32, ptr %ptr, i64 6
  %ptr7 = getelementptr i32, ptr %ptr, i64 7

  %l0 = load i32, ptr %ptr0, align 32
  %l1 = load i32, ptr %ptr1
  %l2 = load i32, ptr %ptr2
  %l3 = load i32, ptr %ptr3
  %l4 = load i32, ptr %ptr4, align 16
  %l5 = load i32, ptr %ptr5
  %l6 = load i32, ptr %ptr6
  %l7 = load i32, ptr %ptr7

  store i32 %l7, ptr %ptr0, align 32
  store i32 %l6, ptr %ptr1
  store i32 %l5, ptr %ptr2
  store i32 %l4, ptr %ptr3
  store i32 %l3, ptr %ptr4, align 16
  store i32 %l2, ptr %ptr5
  store i32 %l1, ptr %ptr6
  store i32 %l0, ptr %ptr7

  ret void
}