[flang] Add a subset of PowerPC MMA (Matrix Multiply Accelerate) intrinsics
This patch includes a subset of the MMA intrinsics that are included in the mma intrinsic module: mma_assemble_acc, mma_assemble_pair, mma_build_acc, mma_disassemble_acc, and mma_disassemble_pair. Submitted on behalf of Daniel Chen <cdchen@ca.ibm.com>. Differential Revision: https://reviews.llvm.org/D155725
This commit is contained in:
@@ -45,6 +45,21 @@ enum class VecOp {
|
||||
Xor
|
||||
};
|
||||
|
||||
/// Enums used to templatize and share lowering of PowerPC MMA intrinsics.
|
||||
// Identifies the MMA operation being lowered. The value is used as a template
// argument of genMmaIntr and selects the LLVM intrinsic name and the function
// signature of the generated call.
enum class MMAOp {
|
||||
// Lowered to llvm.ppc.mma.assemble.acc; used by both mma_assemble_acc and
// mma_build_acc.
AssembleAcc,
|
||||
// Lowered to llvm.ppc.vsx.assemble.pair.
AssemblePair,
|
||||
// Lowered to llvm.ppc.mma.disassemble.acc.
DisassembleAcc,
|
||||
// Lowered to llvm.ppc.vsx.disassemble.pair.
DisassemblePair,
|
||||
};
|
||||
|
||||
// Selects how genMmaIntr maps the Fortran subroutine arguments onto the LLVM
// intrinsic call and what it does with the call result.
enum class MMAHandlerOp {
|
||||
// Arguments are passed through in order, starting with the first one; the
// call result is not stored.
NoOp,
|
||||
// The first argument receives the call result; the remaining arguments are
// passed to the intrinsic in order.
SubToFunc,
|
||||
// Like SubToFunc, but the remaining arguments are passed in reverse order
// when the little-endian check in genMmaIntr succeeds.
SubToFuncReverseArgOnLE,
|
||||
// The first argument is loaded from its address and passed to the intrinsic
// as its first operand; the call result is then stored back through it.
FirstArgIsResult,
|
||||
};
|
||||
|
||||
// Wrapper struct to encapsulate information for a vector type. Preserves
|
||||
// sign of eleTy if eleTy is signed/unsigned integer. Helps with vector type
|
||||
// conversions.
|
||||
@@ -120,13 +135,16 @@ struct PPCIntrinsicLibrary : IntrinsicLibrary {
|
||||
PPCIntrinsicLibrary() = delete;
|
||||
PPCIntrinsicLibrary(const PPCIntrinsicLibrary &) = delete;
|
||||
|
||||
// PPC MMA intrinsic generic handler
|
||||
template <MMAOp IntrId, MMAHandlerOp HandlerOp>
|
||||
void genMmaIntr(llvm::ArrayRef<fir::ExtendedValue>);
|
||||
|
||||
// PPC intrinsic handlers.
|
||||
template <bool isImm>
|
||||
void genMtfsf(llvm::ArrayRef<fir::ExtendedValue>);
|
||||
|
||||
fir::ExtendedValue genVecAbs(mlir::Type resultType,
|
||||
llvm::ArrayRef<fir::ExtendedValue> args);
|
||||
|
||||
template <VecOp>
|
||||
fir::ExtendedValue
|
||||
genVecAddAndMulSubXor(mlir::Type resultType,
|
||||
|
||||
@@ -5746,6 +5746,9 @@ getIntrinsicArgumentLowering(llvm::StringRef specificName) {
|
||||
if (const IntrinsicHandler *handler = findIntrinsicHandler(name))
|
||||
if (!handler->argLoweringRules.hasDefaultRules())
|
||||
return &handler->argLoweringRules;
|
||||
if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name))
|
||||
if (!ppcHandler->argLoweringRules.hasDefaultRules())
|
||||
return &ppcHandler->argLoweringRules;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
||||
@@ -14,8 +14,8 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "flang/Optimizer/Builder/PPCIntrinsicCall.h"
|
||||
#include "flang/Evaluate/common.h"
|
||||
#include "flang/Optimizer/Builder/FIRBuilder.h"
|
||||
#include "flang/Optimizer/Builder/IntrinsicCall.h"
|
||||
#include "flang/Optimizer/Builder/MutableBox.h"
|
||||
#include "mlir/Dialect/Vector/IR/VectorOps.h"
|
||||
|
||||
@@ -25,6 +25,40 @@ using PI = PPCIntrinsicLibrary;
|
||||
|
||||
// PPC specific intrinsic handlers.
|
||||
static constexpr IntrinsicHandler ppcHandlers[]{
|
||||
{"__ppc_mma_assemble_acc",
|
||||
static_cast<IntrinsicLibrary::SubroutineGenerator>(
|
||||
&PI::genMmaIntr<MMAOp::AssembleAcc, MMAHandlerOp::SubToFunc>),
|
||||
{{{"acc", asAddr},
|
||||
{"arg1", asValue},
|
||||
{"arg2", asValue},
|
||||
{"arg3", asValue},
|
||||
{"arg4", asValue}}},
|
||||
/*isElemental=*/true},
|
||||
{"__ppc_mma_assemble_pair",
|
||||
static_cast<IntrinsicLibrary::SubroutineGenerator>(
|
||||
&PI::genMmaIntr<MMAOp::AssemblePair, MMAHandlerOp::SubToFunc>),
|
||||
{{{"pair", asAddr}, {"arg1", asValue}, {"arg2", asValue}}},
|
||||
/*isElemental=*/true},
|
||||
{"__ppc_mma_build_acc",
|
||||
static_cast<IntrinsicLibrary::SubroutineGenerator>(
|
||||
&PI::genMmaIntr<MMAOp::AssembleAcc,
|
||||
MMAHandlerOp::SubToFuncReverseArgOnLE>),
|
||||
{{{"acc", asAddr},
|
||||
{"arg1", asValue},
|
||||
{"arg2", asValue},
|
||||
{"arg3", asValue},
|
||||
{"arg4", asValue}}},
|
||||
/*isElemental=*/true},
|
||||
{"__ppc_mma_disassemble_acc",
|
||||
static_cast<IntrinsicLibrary::SubroutineGenerator>(
|
||||
&PI::genMmaIntr<MMAOp::DisassembleAcc, MMAHandlerOp::SubToFunc>),
|
||||
{{{"data", asAddr}, {"acc", asValue}}},
|
||||
/*isElemental=*/true},
|
||||
{"__ppc_mma_disassemble_pair",
|
||||
static_cast<IntrinsicLibrary::SubroutineGenerator>(
|
||||
&PI::genMmaIntr<MMAOp::DisassemblePair, MMAHandlerOp::SubToFunc>),
|
||||
{{{"data", asAddr}, {"pair", asValue}}},
|
||||
/*isElemental=*/true},
|
||||
{"__ppc_mtfsf",
|
||||
static_cast<IntrinsicLibrary::SubroutineGenerator>(&PI::genMtfsf<false>),
|
||||
{{{"mask", asValue}, {"r", asValue}}},
|
||||
@@ -326,6 +360,103 @@ checkPPCMathOperationsRange(llvm::StringRef name) {
|
||||
return ppcMathOps.equal_range(name);
|
||||
}
|
||||
|
||||
static mlir::FunctionType genMmaVpFuncType(mlir::MLIRContext *context,
|
||||
int quadCnt, int pairCnt, int vecCnt,
|
||||
int intCnt = 0,
|
||||
int vecElemBitSize = 8,
|
||||
int intBitSize = 32) {
|
||||
// Constructs a function type with the following signature:
|
||||
// Result type: __vector_pair
|
||||
// Arguments:
|
||||
// quadCnt: number of arguments that has __vector_quad type, followed by
|
||||
// pairCnt: number of arguments that has __vector_pair type, followed by
|
||||
// vecCnt: number of arguments that has vector(integer) type, followed by
|
||||
// intCnt: number of arguments that has integer type
|
||||
// vecElemBitSize: specifies the size of vector elements in bits
|
||||
// intBitSize: specifies the size of integer arguments in bits
|
||||
auto vType{mlir::VectorType::get(
|
||||
128 / vecElemBitSize, mlir::IntegerType::get(context, vecElemBitSize))};
|
||||
auto vpType{fir::VectorType::get(256, mlir::IntegerType::get(context, 1))};
|
||||
auto vqType{fir::VectorType::get(512, mlir::IntegerType::get(context, 1))};
|
||||
auto iType{mlir::IntegerType::get(context, intBitSize)};
|
||||
llvm::SmallVector<mlir::Type> argTypes;
|
||||
for (int i = 0; i < quadCnt; ++i) {
|
||||
argTypes.push_back(vqType);
|
||||
}
|
||||
for (int i = 0; i < pairCnt; ++i) {
|
||||
argTypes.push_back(vpType);
|
||||
}
|
||||
for (int i = 0; i < vecCnt; ++i) {
|
||||
argTypes.push_back(vType);
|
||||
}
|
||||
for (int i = 0; i < intCnt; ++i) {
|
||||
argTypes.push_back(iType);
|
||||
}
|
||||
|
||||
return mlir::FunctionType::get(context, argTypes, {vpType});
|
||||
}
|
||||
|
||||
static mlir::FunctionType genMmaVqFuncType(mlir::MLIRContext *context,
|
||||
int quadCnt, int pairCnt, int vecCnt,
|
||||
int intCnt = 0,
|
||||
int vecElemBitSize = 8,
|
||||
int intBitSize = 32) {
|
||||
// Constructs a function type with the following signature:
|
||||
// Result type: __vector_quad
|
||||
// Arguments:
|
||||
// quadCnt: number of arguments that has __vector_quad type, followed by
|
||||
// pairCnt: number of arguments that has __vector_pair type, followed by
|
||||
// vecCnt: number of arguments that has vector(integer) type, followed by
|
||||
// intCnt: number of arguments that has integer type
|
||||
// vecElemBitSize: specifies the size of vector elements in bits
|
||||
// intBitSize: specifies the size of integer arguments in bits
|
||||
auto vType{mlir::VectorType::get(
|
||||
128 / vecElemBitSize, mlir::IntegerType::get(context, vecElemBitSize))};
|
||||
auto vpType{fir::VectorType::get(256, mlir::IntegerType::get(context, 1))};
|
||||
auto vqType{fir::VectorType::get(512, mlir::IntegerType::get(context, 1))};
|
||||
auto iType{mlir::IntegerType::get(context, intBitSize)};
|
||||
llvm::SmallVector<mlir::Type> argTypes;
|
||||
for (int i = 0; i < quadCnt; ++i) {
|
||||
argTypes.push_back(vqType);
|
||||
}
|
||||
for (int i = 0; i < pairCnt; ++i) {
|
||||
argTypes.push_back(vpType);
|
||||
}
|
||||
for (int i = 0; i < vecCnt; ++i) {
|
||||
argTypes.push_back(vType);
|
||||
}
|
||||
for (int i = 0; i < intCnt; ++i) {
|
||||
argTypes.push_back(iType);
|
||||
}
|
||||
|
||||
return mlir::FunctionType::get(context, argTypes, {vqType});
|
||||
}
|
||||
|
||||
/// Returns the function type of the disassemble intrinsic selected by mmaOp.
///
/// The single argument is a fir.vector<512:i1> for MMAOp::DisassembleAcc and a
/// fir.vector<256:i1> for MMAOp::DisassemblePair. The result is a literal LLVM
/// struct with four (accumulator) or two (pair) members, each a vector of 16
/// 8-bit integers. Any other MMAOp value is unsupported.
mlir::FunctionType genMmaDisassembleFuncType(mlir::MLIRContext *context,
                                             MMAOp mmaOp) {
  // Element type of every member of the returned struct.
  auto byteVecTy{mlir::VectorType::get(16, mlir::IntegerType::get(context, 8))};

  // Pick the width of the source operand and the number of struct members.
  uint64_t sourceBits{0};
  unsigned memberCount{0};
  switch (mmaOp) {
  case MMAOp::DisassembleAcc:
    sourceBits = 512;
    memberCount = 4;
    break;
  case MMAOp::DisassemblePair:
    sourceBits = 256;
    memberCount = 2;
    break;
  default:
    llvm_unreachable(
        "Unsupported intrinsic code for function signature generator");
  }

  auto sourceTy{
      fir::VectorType::get(sourceBits, mlir::IntegerType::get(context, 1))};
  llvm::SmallVector<mlir::Type> resultMembers;
  for (unsigned idx = 0; idx < memberCount; ++idx)
    resultMembers.push_back(byteVecTy);
  auto structTy{
      mlir::LLVM::LLVMStructType::getLiteral(context, resultMembers)};
  return mlir::FunctionType::get(context, {sourceTy}, {structTy});
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// PowerPC specific intrinsic handlers.
|
||||
//===----------------------------------------------------------------------===//
|
||||
@@ -1130,4 +1261,114 @@ PPCIntrinsicLibrary::genVecShift(mlir::Type resultType,
|
||||
return shftRes;
|
||||
}
|
||||
|
||||
/// Returns the name of the LLVM intrinsic that implements mmaOp.
/// The accumulator operations map to llvm.ppc.mma.* and the pair operations
/// map to llvm.ppc.vsx.*.
const char *getMmaIrIntrName(MMAOp mmaOp) {
  switch (mmaOp) {
  case MMAOp::AssembleAcc:
    return "llvm.ppc.mma.assemble.acc";
  case MMAOp::AssemblePair:
    return "llvm.ppc.vsx.assemble.pair";
  case MMAOp::DisassembleAcc:
    return "llvm.ppc.mma.disassemble.acc";
  case MMAOp::DisassemblePair:
    return "llvm.ppc.vsx.disassemble.pair";
  }
  // The switch covers every enumerator; this keeps the function from falling
  // off its end if mmaOp ever holds a value outside the enumeration.
  llvm_unreachable("getMmaIrIntrName");
}
|
||||
|
||||
/// Returns the function type of the LLVM intrinsic that implements mmaOp.
/// The assemble operations take four (accumulator) or two (pair) vector
/// arguments; the disassemble signatures come from genMmaDisassembleFuncType.
mlir::FunctionType getMmaIrFuncType(mlir::MLIRContext *context, MMAOp mmaOp) {
  switch (mmaOp) {
  case MMAOp::AssembleAcc:
    return genMmaVqFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 4);
  case MMAOp::AssemblePair:
    return genMmaVpFuncType(context, /*Quad*/ 0, /*Pair*/ 0, /*Vector*/ 2);
  case MMAOp::DisassembleAcc:
    return genMmaDisassembleFuncType(context, mmaOp);
  case MMAOp::DisassemblePair:
    return genMmaDisassembleFuncType(context, mmaOp);
  }
  // The switch covers every enumerator; this keeps the function from falling
  // off its end if mmaOp ever holds a value outside the enumeration.
  llvm_unreachable("getMmaIrFuncType");
}
|
||||
|
||||
template <MMAOp IntrId, MMAHandlerOp HandlerOp>
|
||||
void PPCIntrinsicLibrary::genMmaIntr(llvm::ArrayRef<fir::ExtendedValue> args) {
|
||||
auto context{builder.getContext()};
|
||||
mlir::FunctionType intrFuncType{getMmaIrFuncType(context, IntrId)};
|
||||
mlir::func::FuncOp funcOp{
|
||||
builder.addNamedFunction(loc, getMmaIrIntrName(IntrId), intrFuncType)};
|
||||
llvm::SmallVector<mlir::Value> intrArgs;
|
||||
|
||||
// Depending on SubToFunc, change the subroutine call to a function call.
|
||||
// First argument represents the result. Rest of the arguments
|
||||
// are shifted one position to form the actual argument list.
|
||||
size_t argStart{0};
|
||||
size_t argStep{1};
|
||||
size_t e{args.size()};
|
||||
if (HandlerOp == MMAHandlerOp::SubToFunc) {
|
||||
// The first argument becomes function result. Start from the second
|
||||
// argument.
|
||||
argStart = 1;
|
||||
} else if (HandlerOp == MMAHandlerOp::SubToFuncReverseArgOnLE) {
|
||||
// Reverse argument order on little-endian target only.
|
||||
// The reversal does not depend on the setting of non-native-order option.
|
||||
if (Fortran::evaluate::isHostLittleEndian) {
|
||||
// Load the arguments in reverse order.
|
||||
argStart = args.size() - 1;
|
||||
// The first argument becomes function result. Stop at the second
|
||||
// argument.
|
||||
e = 0;
|
||||
argStep = -1;
|
||||
} else {
|
||||
// Load the arguments in natural order.
|
||||
// The first argument becomes function result. Start from the second
|
||||
// argument.
|
||||
argStart = 1;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = argStart, j = 0; i != e; i += argStep, ++j) {
|
||||
auto v{fir::getBase(args[i])};
|
||||
if (i == 0 && HandlerOp == MMAHandlerOp::FirstArgIsResult) {
|
||||
// First argument is passed in as an address. We need to load
|
||||
// the content to match the LLVM interface.
|
||||
v = builder.create<fir::LoadOp>(loc, v);
|
||||
}
|
||||
auto vType{v.getType()};
|
||||
mlir::Type targetType{intrFuncType.getInput(j)};
|
||||
if (vType != targetType) {
|
||||
if (targetType.isa<mlir::VectorType>()) {
|
||||
// Perform vector type conversion for arguments passed by value.
|
||||
auto eleTy{vType.dyn_cast<fir::VectorType>().getEleTy()};
|
||||
auto len{vType.dyn_cast<fir::VectorType>().getLen()};
|
||||
mlir::VectorType mlirType = mlir::VectorType::get(len, eleTy);
|
||||
auto v0{builder.createConvert(loc, mlirType, v)};
|
||||
auto v1{builder.create<mlir::vector::BitCastOp>(loc, targetType, v0)};
|
||||
intrArgs.push_back(v1);
|
||||
} else if (targetType.isa<mlir::IntegerType>() &&
|
||||
vType.isa<mlir::IntegerType>()) {
|
||||
auto v0{builder.createConvert(loc, targetType, v)};
|
||||
intrArgs.push_back(v0);
|
||||
} else {
|
||||
llvm::errs() << "\nUnexpected type conversion requested: "
|
||||
<< " from " << vType << " to " << targetType << "\n";
|
||||
llvm_unreachable("Unsupported type conversion for argument to PowerPC "
|
||||
"MMA intrinsic");
|
||||
}
|
||||
} else {
|
||||
intrArgs.push_back(v);
|
||||
}
|
||||
}
|
||||
auto callSt{builder.create<fir::CallOp>(loc, funcOp, intrArgs)};
|
||||
if (HandlerOp == MMAHandlerOp::SubToFunc ||
|
||||
HandlerOp == MMAHandlerOp::SubToFuncReverseArgOnLE ||
|
||||
HandlerOp == MMAHandlerOp::FirstArgIsResult) {
|
||||
// Convert pointer type if needed.
|
||||
mlir::Value callResult{callSt.getResult(0)};
|
||||
mlir::Value destPtr{fir::getBase(args[0])};
|
||||
mlir::Type callResultPtrType{builder.getRefType(callResult.getType())};
|
||||
if (destPtr.getType() != callResultPtrType) {
|
||||
destPtr = builder.create<fir::ConvertOp>(loc, callResultPtrType, destPtr);
|
||||
}
|
||||
// Copy the result.
|
||||
builder.create<fir::StoreOp>(loc, callResult, destPtr);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace fir
|
||||
|
||||
@@ -947,20 +947,21 @@ bool fir::ConvertOp::isPointerCompatible(mlir::Type ty) {
|
||||
}
|
||||
|
||||
static std::optional<mlir::Type> getVectorElementType(mlir::Type ty) {
|
||||
if (mlir::isa<fir::VectorType>(ty)) {
|
||||
auto elemTy = mlir::dyn_cast<fir::VectorType>(ty).getEleTy();
|
||||
mlir::Type elemTy;
|
||||
if (mlir::isa<fir::VectorType>(ty))
|
||||
elemTy = mlir::dyn_cast<fir::VectorType>(ty).getEleTy();
|
||||
else if (mlir::isa<mlir::VectorType>(ty))
|
||||
elemTy = mlir::dyn_cast<mlir::VectorType>(ty).getElementType();
|
||||
else
|
||||
return std::nullopt;
|
||||
|
||||
// fir.vector<4:ui32> is converted to mlir.vector<4xi32>
|
||||
if (elemTy.isUnsignedInteger()) {
|
||||
elemTy = mlir::IntegerType::get(
|
||||
ty.getContext(),
|
||||
mlir::dyn_cast<mlir::IntegerType>(elemTy).getWidth());
|
||||
}
|
||||
return elemTy;
|
||||
} else if (mlir::isa<mlir::VectorType>(ty))
|
||||
return mlir::dyn_cast<mlir::VectorType>(ty).getElementType();
|
||||
|
||||
return std::nullopt;
|
||||
// e.g. fir.vector<4:ui32> => mlir.vector<4xi32>
|
||||
// e.g. mlir.vector<4xui32> => mlir.vector<4xi32>
|
||||
if (elemTy.isUnsignedInteger()) {
|
||||
elemTy = mlir::IntegerType::get(
|
||||
ty.getContext(), mlir::dyn_cast<mlir::IntegerType>(elemTy).getWidth());
|
||||
}
|
||||
return elemTy;
|
||||
}
|
||||
|
||||
static std::optional<uint64_t> getVectorLen(mlir::Type ty) {
|
||||
|
||||
@@ -298,7 +298,8 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
|
||||
actualFirstSymbol && actualFirstSymbol->attrs().test(Attr::ASYNCHRONOUS)};
|
||||
bool actualIsVolatile{
|
||||
actualFirstSymbol && actualFirstSymbol->attrs().test(Attr::VOLATILE)};
|
||||
if (const auto *derived{evaluate::GetDerivedTypeSpec(actualType.type())}) {
|
||||
const auto *derived{evaluate::GetDerivedTypeSpec(actualType.type())};
|
||||
if (derived && !derived->IsVectorType()) {
|
||||
if (dummy.type.type().IsAssumedType()) {
|
||||
if (!derived->parameters().empty()) { // 15.5.2.4(2)
|
||||
messages.Say(
|
||||
|
||||
@@ -518,8 +518,11 @@ bool Semantics::Perform() {
|
||||
.statement.v.source == "__ppc_types")) {
|
||||
// Don't try to read the builtins module when we're actually building it.
|
||||
} else if (frontModule &&
|
||||
std::get<parser::Statement<parser::ModuleStmt>>(frontModule->value().t)
|
||||
.statement.v.source == "__ppc_intrinsics") {
|
||||
(std::get<parser::Statement<parser::ModuleStmt>>(frontModule->value().t)
|
||||
.statement.v.source == "__ppc_intrinsics" ||
|
||||
std::get<parser::Statement<parser::ModuleStmt>>(
|
||||
frontModule->value().t)
|
||||
.statement.v.source == "mma")) {
|
||||
// The derived type definition for the vectors is needed.
|
||||
context_.UsePPCBuiltinTypesModule();
|
||||
} else {
|
||||
|
||||
220
flang/module/mma.f90
Normal file
220
flang/module/mma.f90
Normal file
@@ -0,0 +1,220 @@
|
||||
!===-- module/mma.f90 ------------------------------------------------------===!
|
||||
!
|
||||
! Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
! See https://llvm.org/LICENSE.txt for license information.
|
||||
! SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
!
|
||||
!===------------------------------------------------------------------------===!
|
||||
|
||||
module mma
|
||||
implicit none
|
||||
private
|
||||
|
||||
abstract interface
|
||||
|
||||
!! ========== 3 arguments subroutine interface ===============================!!
|
||||
!! subroutine s(__vector_pair, vector(i), vector(i))
|
||||
#define ELEM_SUB_VPVIVI(VKIND) \
|
||||
elemental subroutine sub_vpvi##VKIND##vi##VKIND(pair, arg1, arg2); \
|
||||
__vector_pair, intent(out) :: pair ; \
|
||||
vector(integer(VKIND)), intent(in) :: arg1, arg2; \
|
||||
end subroutine ;
|
||||
|
||||
!! subroutine s(__vector_pair, vector(u), vector(u))
|
||||
#define ELEM_SUB_VPVUVU(VKIND) \
|
||||
elemental subroutine sub_vpvu##VKIND##vu##VKIND(pair, arg1, arg2); \
|
||||
__vector_pair, intent(out) :: pair ; \
|
||||
vector(unsigned(VKIND)), intent(in) :: arg1, arg2; \
|
||||
end subroutine ;
|
||||
|
||||
!! subroutine s(__vector_pair, vector(r), vector(r))
|
||||
#define ELEM_SUB_VPVRVR(VKIND) \
|
||||
elemental subroutine sub_vpvr##VKIND##vr##VKIND(pair, arg1, arg2); \
|
||||
__vector_pair, intent(out) :: pair ; \
|
||||
vector(real(VKIND)), intent(in) :: arg1, arg2; \
|
||||
end subroutine ;
|
||||
|
||||
ELEM_SUB_VPVIVI(1) ELEM_SUB_VPVIVI(2)
|
||||
ELEM_SUB_VPVIVI(4) ELEM_SUB_VPVIVI(8)
|
||||
ELEM_SUB_VPVUVU(1) ELEM_SUB_VPVUVU(2)
|
||||
ELEM_SUB_VPVUVU(4) ELEM_SUB_VPVUVU(8)
|
||||
ELEM_SUB_VPVRVR(4) ELEM_SUB_VPVRVR(8)
|
||||
|
||||
#undef ELEM_SUB_VPVIVI
|
||||
#undef ELEM_SUB_VPVUVU
|
||||
#undef ELEM_SUB_VPVRVR
|
||||
|
||||
!! ========== 5 arguments subroutine interface ===============================!!
|
||||
!! subroutine s(__vector_quad, vector(i), vector(i), vector(i), vector(i))
|
||||
#define ELEM_SUB_VQVIVIVIVI(VKIND) \
|
||||
elemental subroutine sub_vqvi##VKIND##vi##VKIND##vi##VKIND##vi##VKIND(acc, arg1, arg2, arg3, arg4); \
|
||||
__vector_quad, intent(out) :: acc; \
|
||||
vector(integer(VKIND)), intent(in) :: arg1, arg2, arg3, arg4; \
|
||||
end subroutine ;
|
||||
|
||||
!! subroutine s(__vector_quad, vector(u), vector(u), vector(u), vector(u))
|
||||
#define ELEM_SUB_VQVUVUVUVU(VKIND) \
|
||||
elemental subroutine sub_vqvu##VKIND##vu##VKIND##vu##VKIND##vu##VKIND(acc, arg1, arg2, arg3, arg4); \
|
||||
__vector_quad, intent(out) :: acc; \
|
||||
vector(unsigned(VKIND)), intent(in) :: arg1, arg2, arg3, arg4; \
|
||||
end subroutine ;
|
||||
|
||||
!! subroutine s(__vector_quad, vector(r), vector(r), vector(r), vector(r))
|
||||
#define ELEM_SUB_VQVRVRVRVR(VKIND) \
|
||||
elemental subroutine sub_vqvr##VKIND##vr##VKIND##vr##VKIND##vr##VKIND(acc, arg1, arg2, arg3, arg4); \
|
||||
__vector_quad, intent(out) :: acc; \
|
||||
vector(real(VKIND)), intent(in) :: arg1, arg2, arg3, arg4; \
|
||||
end subroutine ;
|
||||
|
||||
ELEM_SUB_VQVIVIVIVI(1) ELEM_SUB_VQVIVIVIVI(2)
|
||||
ELEM_SUB_VQVIVIVIVI(4) ELEM_SUB_VQVIVIVIVI(8)
|
||||
ELEM_SUB_VQVUVUVUVU(1) ELEM_SUB_VQVUVUVUVU(2)
|
||||
ELEM_SUB_VQVUVUVUVU(4) ELEM_SUB_VQVUVUVUVU(8)
|
||||
ELEM_SUB_VQVRVRVRVR(4) ELEM_SUB_VQVRVRVRVR(8)
|
||||
|
||||
#undef ELEM_SUB_VQVRVRVRVR
|
||||
#undef ELEM_SUB_VQVUVUVUVU
|
||||
#undef ELEM_SUB_VQVIVIVIVI
|
||||
|
||||
!! ========== non-macro interface =============================================!!
|
||||
! Interface of an elemental subroutine taking an assumed-type 'data' argument
! and a __vector_pair 'pair' argument. It is used later in this module as the
! interface of __ppc_mma_disassemble_pair.
elemental subroutine sub_atvp(data, pair)
|
||||
! Dummy arg 'data' is supposed to be intent(out) of any type,
|
||||
! but according to Fortran 2018: C709: Type(*) arguments can not have
|
||||
! intent(out) attribute. Use intent(inout) instead.
|
||||
type(*), intent(inout) :: data
|
||||
__vector_pair, intent(inout) :: pair
|
||||
end subroutine
|
||||
|
||||
! Interface of an elemental subroutine taking an assumed-type 'data' argument
! and a __vector_quad 'acc' argument. It is used later in this module as the
! interface of __ppc_mma_disassemble_acc.
elemental subroutine sub_atvq(data, acc)
|
||||
! Dummy arg 'data' is supposed to be intent(out) of any type,
|
||||
! but according to Fortran 2018: C709: Type(*) arguments can not have
|
||||
! intent(out) attribute. Use intent(inout) instead.
|
||||
type(*), intent(inout) :: data
|
||||
__vector_quad, intent(inout) :: acc
|
||||
end subroutine
|
||||
|
||||
end interface
|
||||
|
||||
|
||||
#define SUB_VQ_VI_VI_VI_VI(NAME, VKIND) __ppc_##NAME##_vi##VKIND##vi##VKIND##vi##VKIND##vi##VKIND
|
||||
#define SUB_VQ_VU_VU_VU_VU(NAME, VKIND) __ppc_##NAME##_vu##VKIND##vu##VKIND##vu##VKIND##vu##VKIND
|
||||
#define SUB_VQ_VR_VR_VR_VR(NAME, VKIND) __ppc_##NAME##_vr##VKIND##vr##VKIND##vr##VKIND##vr##VKIND
|
||||
|
||||
#define VEC_SUB_VQ_VI_VI_VI_VI(NAME, VKIND) \
|
||||
procedure(sub_vqvi##VKIND##vi##VKIND##vi##VKIND##vi##VKIND) :: SUB_VQ_VI_VI_VI_VI(NAME, VKIND);
|
||||
#define VEC_SUB_VQ_VU_VU_VU_VU(NAME, VKIND) \
|
||||
procedure(sub_vqvu##VKIND##vu##VKIND##vu##VKIND##vu##VKIND) :: SUB_VQ_VU_VU_VU_VU(NAME, VKIND);
|
||||
#define VEC_SUB_VQ_VR_VR_VR_VR(NAME, VKIND) \
|
||||
procedure(sub_vqvr##VKIND##vr##VKIND##vr##VKIND##vr##VKIND) :: SUB_VQ_VR_VR_VR_VR(NAME, VKIND);
|
||||
|
||||
! mma_assemble_acc
|
||||
VEC_SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,1)
|
||||
VEC_SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,2)
|
||||
VEC_SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,4)
|
||||
VEC_SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,8)
|
||||
VEC_SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,1)
|
||||
VEC_SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,2)
|
||||
VEC_SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,4)
|
||||
VEC_SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,8)
|
||||
VEC_SUB_VQ_VR_VR_VR_VR(mma_assemble_acc,4)
|
||||
VEC_SUB_VQ_VR_VR_VR_VR(mma_assemble_acc,8)
|
||||
interface mma_assemble_acc
|
||||
procedure :: SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,1)
|
||||
procedure :: SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,2)
|
||||
procedure :: SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,4)
|
||||
procedure :: SUB_VQ_VI_VI_VI_VI(mma_assemble_acc,8)
|
||||
procedure :: SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,1)
|
||||
procedure :: SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,2)
|
||||
procedure :: SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,4)
|
||||
procedure :: SUB_VQ_VU_VU_VU_VU(mma_assemble_acc,8)
|
||||
procedure :: SUB_VQ_VR_VR_VR_VR(mma_assemble_acc,4)
|
||||
procedure :: SUB_VQ_VR_VR_VR_VR(mma_assemble_acc,8)
|
||||
end interface
|
||||
public mma_assemble_acc
|
||||
|
||||
! mma_build_acc
|
||||
VEC_SUB_VQ_VI_VI_VI_VI(mma_build_acc,1)
|
||||
VEC_SUB_VQ_VI_VI_VI_VI(mma_build_acc,2)
|
||||
VEC_SUB_VQ_VI_VI_VI_VI(mma_build_acc,4)
|
||||
VEC_SUB_VQ_VI_VI_VI_VI(mma_build_acc,8)
|
||||
VEC_SUB_VQ_VU_VU_VU_VU(mma_build_acc,1)
|
||||
VEC_SUB_VQ_VU_VU_VU_VU(mma_build_acc,2)
|
||||
VEC_SUB_VQ_VU_VU_VU_VU(mma_build_acc,4)
|
||||
VEC_SUB_VQ_VU_VU_VU_VU(mma_build_acc,8)
|
||||
VEC_SUB_VQ_VR_VR_VR_VR(mma_build_acc,4)
|
||||
VEC_SUB_VQ_VR_VR_VR_VR(mma_build_acc,8)
|
||||
interface mma_build_acc
|
||||
procedure :: SUB_VQ_VI_VI_VI_VI(mma_build_acc,1)
|
||||
procedure :: SUB_VQ_VI_VI_VI_VI(mma_build_acc,2)
|
||||
procedure :: SUB_VQ_VI_VI_VI_VI(mma_build_acc,4)
|
||||
procedure :: SUB_VQ_VI_VI_VI_VI(mma_build_acc,8)
|
||||
procedure :: SUB_VQ_VU_VU_VU_VU(mma_build_acc,1)
|
||||
procedure :: SUB_VQ_VU_VU_VU_VU(mma_build_acc,2)
|
||||
procedure :: SUB_VQ_VU_VU_VU_VU(mma_build_acc,4)
|
||||
procedure :: SUB_VQ_VU_VU_VU_VU(mma_build_acc,8)
|
||||
procedure :: SUB_VQ_VR_VR_VR_VR(mma_build_acc,4)
|
||||
procedure :: SUB_VQ_VR_VR_VR_VR(mma_build_acc,8)
|
||||
end interface
|
||||
public mma_build_acc
|
||||
|
||||
#undef VEC_SUB_VQ_VR_VR_VR_VR
|
||||
#undef VEC_SUB_VQ_VU_VU_VU_VU
|
||||
#undef VEC_SUB_VQ_VI_VI_VI_VI
|
||||
#undef SUB_VQ_VR_VR_VR_VR
|
||||
#undef SUB_VQ_VU_VU_VU_VU
|
||||
#undef SUB_VQ_VI_VI_VI_VI
|
||||
|
||||
#define SUB_VP_VI_VI(NAME, VKIND) __ppc_##NAME##_vi##VKIND##vi##VKIND
|
||||
#define SUB_VP_VU_VU(NAME, VKIND) __ppc_##NAME##_vu##VKIND##vu##VKIND
|
||||
#define SUB_VP_VR_VR(NAME, VKIND) __ppc_##NAME##_vr##VKIND##vr##VKIND
|
||||
|
||||
#define VEC_SUB_VP_VI_VI(NAME, VKIND) \
|
||||
procedure(sub_vpvi##VKIND##vi##VKIND) :: SUB_VP_VI_VI(NAME, VKIND);
|
||||
#define VEC_SUB_VP_VU_VU(NAME, VKIND) \
|
||||
procedure(sub_vpvu##VKIND##vu##VKIND) :: SUB_VP_VU_VU(NAME, VKIND);
|
||||
#define VEC_SUB_VP_VR_VR(NAME, VKIND) \
|
||||
procedure(sub_vpvr##VKIND##vr##VKIND) :: SUB_VP_VR_VR(NAME, VKIND);
|
||||
|
||||
! mma_assemble_pair
|
||||
VEC_SUB_VP_VI_VI(mma_assemble_pair,1) VEC_SUB_VP_VI_VI(mma_assemble_pair,2)
|
||||
VEC_SUB_VP_VI_VI(mma_assemble_pair,4) VEC_SUB_VP_VI_VI(mma_assemble_pair,8)
|
||||
VEC_SUB_VP_VU_VU(mma_assemble_pair,1) VEC_SUB_VP_VU_VU(mma_assemble_pair,2)
|
||||
VEC_SUB_VP_VU_VU(mma_assemble_pair,4) VEC_SUB_VP_VU_VU(mma_assemble_pair,8)
|
||||
VEC_SUB_VP_VR_VR(mma_assemble_pair,4) VEC_SUB_VP_VR_VR(mma_assemble_pair,8)
|
||||
interface mma_assemble_pair
|
||||
procedure :: SUB_VP_VI_VI(mma_assemble_pair,1)
|
||||
procedure :: SUB_VP_VI_VI(mma_assemble_pair,2)
|
||||
procedure :: SUB_VP_VI_VI(mma_assemble_pair,4)
|
||||
procedure :: SUB_VP_VI_VI(mma_assemble_pair,8)
|
||||
procedure :: SUB_VP_VU_VU(mma_assemble_pair,1)
|
||||
procedure :: SUB_VP_VU_VU(mma_assemble_pair,2)
|
||||
procedure :: SUB_VP_VU_VU(mma_assemble_pair,4)
|
||||
procedure :: SUB_VP_VU_VU(mma_assemble_pair,8)
|
||||
procedure :: SUB_VP_VR_VR(mma_assemble_pair,4)
|
||||
procedure :: SUB_VP_VR_VR(mma_assemble_pair,8)
|
||||
end interface
|
||||
public mma_assemble_pair
|
||||
|
||||
#undef VEC_SUB_VP_VR_VR
|
||||
#undef VEC_SUB_VP_VU_VU
|
||||
#undef VEC_SUB_VP_VI_VI
|
||||
#undef SUB_VP_VR_VR
|
||||
#undef SUB_VP_VU_VU
|
||||
#undef SUB_VP_VI_VI
|
||||
|
||||
! mma_disassemble_acc
|
||||
procedure(sub_atvq) :: __ppc_mma_disassemble_acc
|
||||
interface mma_disassemble_acc
|
||||
procedure :: __ppc_mma_disassemble_acc
|
||||
end interface
|
||||
public mma_disassemble_acc
|
||||
|
||||
! mma_disassemble_pair
|
||||
procedure(sub_atvp) :: __ppc_mma_disassemble_pair
|
||||
interface mma_disassemble_pair
|
||||
procedure :: __ppc_mma_disassemble_pair
|
||||
end interface
|
||||
public mma_disassemble_pair
|
||||
|
||||
end module
|
||||
|
||||
716
flang/test/Lower/PowerPC/ppc-mma-assemble-disassemble.f90
Normal file
716
flang/test/Lower/PowerPC/ppc-mma-assemble-disassemble.f90
Normal file
@@ -0,0 +1,716 @@
|
||||
! RUN: %flang --target=powerpc64le-unknown-linux-gnu -mcpu=pwr10 -emit-llvm -S %s -o - | FileCheck --check-prefixes="CHECK" %s
|
||||
! REQUIRES: target=powerpc{{.*}}
|
||||
|
||||
! mma_assemble_acc
|
||||
|
||||
subroutine test_assemble_acc_i1()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(integer(1)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_assemble_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_assemble_acc_i1
|
||||
|
||||
! CHECK-LABEL: @test_assemble_acc_i1
|
||||
! CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
! CHECK: %2 = alloca <16 x i8>, i64 1, align 16
|
||||
! CHECK: %3 = alloca <16 x i8>, i64 1, align 16
|
||||
! CHECK: %4 = alloca <16 x i8>, i64 1, align 16
|
||||
! CHECK: %5 = alloca <16 x i8>, i64 1, align 16
|
||||
! CHECK: %6 = load <16 x i8>, ptr %2, align 16
|
||||
! CHECK: %7 = load <16 x i8>, ptr %3, align 16
|
||||
! CHECK: %8 = load <16 x i8>, ptr %4, align 16
|
||||
! CHECK: %9 = load <16 x i8>, ptr %5, align 16
|
||||
! CHECK: %10 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %6, <16 x i8> %7, <16 x i8> %8, <16 x i8> %9)
|
||||
! CHECK: store <512 x i1> %10, ptr %1, align 64
|
||||
|
||||
subroutine test_assemble_acc_i2()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(integer(2)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_assemble_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_assemble_acc_i2
|
||||
|
||||
! CHECK-LABEL: @test_assemble_acc_i2
|
||||
! CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
! CHECK: %2 = alloca <8 x i16>, i64 1, align 16
|
||||
! CHECK: %3 = alloca <8 x i16>, i64 1, align 16
|
||||
! CHECK: %4 = alloca <8 x i16>, i64 1, align 16
|
||||
! CHECK: %5 = alloca <8 x i16>, i64 1, align 16
|
||||
! CHECK: %6 = load <8 x i16>, ptr %2, align 16
|
||||
! CHECK: %7 = load <8 x i16>, ptr %3, align 16
|
||||
! CHECK: %8 = load <8 x i16>, ptr %4, align 16
|
||||
! CHECK: %9 = load <8 x i16>, ptr %5, align 16
|
||||
! CHECK: %10 = bitcast <8 x i16> %6 to <16 x i8>
|
||||
! CHECK: %11 = bitcast <8 x i16> %7 to <16 x i8>
|
||||
! CHECK: %12 = bitcast <8 x i16> %8 to <16 x i8>
|
||||
! CHECK: %13 = bitcast <8 x i16> %9 to <16 x i8>
|
||||
! CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
! CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
|
||||
subroutine test_assemble_acc_i4()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(integer(4)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_assemble_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_assemble_acc_i4
|
||||
|
||||
! CHECK-LABEL: @test_assemble_acc_i4
|
||||
! CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
! CHECK: %2 = alloca <4 x i32>, i64 1, align 16
|
||||
! CHECK: %3 = alloca <4 x i32>, i64 1, align 16
|
||||
! CHECK: %4 = alloca <4 x i32>, i64 1, align 16
|
||||
! CHECK: %5 = alloca <4 x i32>, i64 1, align 16
|
||||
! CHECK: %6 = load <4 x i32>, ptr %2, align 16
|
||||
! CHECK: %7 = load <4 x i32>, ptr %3, align 16
|
||||
! CHECK: %8 = load <4 x i32>, ptr %4, align 16
|
||||
! CHECK: %9 = load <4 x i32>, ptr %5, align 16
|
||||
! CHECK: %10 = bitcast <4 x i32> %6 to <16 x i8>
|
||||
! CHECK: %11 = bitcast <4 x i32> %7 to <16 x i8>
|
||||
! CHECK: %12 = bitcast <4 x i32> %8 to <16 x i8>
|
||||
! CHECK: %13 = bitcast <4 x i32> %9 to <16 x i8>
|
||||
! CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
! CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
subroutine test_assemble_acc_i8()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(integer(8)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_assemble_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_assemble_acc_i8
|
||||
|
||||
! CHECK-LABEL: @test_assemble_acc_i8
|
||||
! CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
! CHECK: %2 = alloca <2 x i64>, i64 1, align 16
|
||||
! CHECK: %3 = alloca <2 x i64>, i64 1, align 16
|
||||
! CHECK: %4 = alloca <2 x i64>, i64 1, align 16
|
||||
! CHECK: %5 = alloca <2 x i64>, i64 1, align 16
|
||||
! CHECK: %6 = load <2 x i64>, ptr %2, align 16
|
||||
! CHECK: %7 = load <2 x i64>, ptr %3, align 16
|
||||
! CHECK: %8 = load <2 x i64>, ptr %4, align 16
|
||||
! CHECK: %9 = load <2 x i64>, ptr %5, align 16
|
||||
! CHECK: %10 = bitcast <2 x i64> %6 to <16 x i8>
|
||||
! CHECK: %11 = bitcast <2 x i64> %7 to <16 x i8>
|
||||
! CHECK: %12 = bitcast <2 x i64> %8 to <16 x i8>
|
||||
! CHECK: %13 = bitcast <2 x i64> %9 to <16 x i8>
|
||||
! CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
! CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
|
||||
subroutine test_assemble_acc_u1()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(unsigned(1)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_assemble_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_assemble_acc_u1
|
||||
|
||||
! CHECK-LABEL: @test_assemble_acc_u1
|
||||
! CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
! CHECK: %2 = alloca <16 x i8>, i64 1, align 16
|
||||
! CHECK: %3 = alloca <16 x i8>, i64 1, align 16
|
||||
! CHECK: %4 = alloca <16 x i8>, i64 1, align 16
|
||||
! CHECK: %5 = alloca <16 x i8>, i64 1, align 16
|
||||
! CHECK: %6 = load <16 x i8>, ptr %2, align 16
|
||||
! CHECK: %7 = load <16 x i8>, ptr %3, align 16
|
||||
! CHECK: %8 = load <16 x i8>, ptr %4, align 16
|
||||
! CHECK: %9 = load <16 x i8>, ptr %5, align 16
|
||||
! CHECK: %10 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %6, <16 x i8> %7, <16 x i8> %8, <16 x i8> %9)
|
||||
! CHECK: store <512 x i1> %10, ptr %1, align 64
|
||||
|
||||
subroutine test_assemble_acc_u2()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(unsigned(2)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_assemble_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_assemble_acc_u2
|
||||
|
||||
! CHECK-LABEL: @test_assemble_acc_u2
|
||||
! CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
! CHECK: %2 = alloca <8 x i16>, i64 1, align 16
|
||||
! CHECK: %3 = alloca <8 x i16>, i64 1, align 16
|
||||
! CHECK: %4 = alloca <8 x i16>, i64 1, align 16
|
||||
! CHECK: %5 = alloca <8 x i16>, i64 1, align 16
|
||||
! CHECK: %6 = load <8 x i16>, ptr %2, align 16
|
||||
! CHECK: %7 = load <8 x i16>, ptr %3, align 16
|
||||
! CHECK: %8 = load <8 x i16>, ptr %4, align 16
|
||||
! CHECK: %9 = load <8 x i16>, ptr %5, align 16
|
||||
! CHECK: %10 = bitcast <8 x i16> %6 to <16 x i8>
|
||||
! CHECK: %11 = bitcast <8 x i16> %7 to <16 x i8>
|
||||
! CHECK: %12 = bitcast <8 x i16> %8 to <16 x i8>
|
||||
! CHECK: %13 = bitcast <8 x i16> %9 to <16 x i8>
|
||||
! CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
! CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
subroutine test_assemble_acc_u4()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(unsigned(4)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_assemble_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_assemble_acc_u4
|
||||
|
||||
! CHECK-LABEL: @test_assemble_acc_u4
|
||||
! CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
! CHECK: %2 = alloca <4 x i32>, i64 1, align 16
|
||||
! CHECK: %3 = alloca <4 x i32>, i64 1, align 16
|
||||
! CHECK: %4 = alloca <4 x i32>, i64 1, align 16
|
||||
! CHECK: %5 = alloca <4 x i32>, i64 1, align 16
|
||||
! CHECK: %6 = load <4 x i32>, ptr %2, align 16
|
||||
! CHECK: %7 = load <4 x i32>, ptr %3, align 16
|
||||
! CHECK: %8 = load <4 x i32>, ptr %4, align 16
|
||||
! CHECK: %9 = load <4 x i32>, ptr %5, align 16
|
||||
! CHECK: %10 = bitcast <4 x i32> %6 to <16 x i8>
|
||||
! CHECK: %11 = bitcast <4 x i32> %7 to <16 x i8>
|
||||
! CHECK: %12 = bitcast <4 x i32> %8 to <16 x i8>
|
||||
! CHECK: %13 = bitcast <4 x i32> %9 to <16 x i8>
|
||||
! CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
! CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
subroutine test_assemble_acc_u8()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(unsigned(8)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_assemble_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_assemble_acc_u8
|
||||
|
||||
! CHECK-LABEL: @test_assemble_acc_u8
|
||||
! CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
! CHECK: %2 = alloca <2 x i64>, i64 1, align 16
|
||||
! CHECK: %3 = alloca <2 x i64>, i64 1, align 16
|
||||
! CHECK: %4 = alloca <2 x i64>, i64 1, align 16
|
||||
! CHECK: %5 = alloca <2 x i64>, i64 1, align 16
|
||||
! CHECK: %6 = load <2 x i64>, ptr %2, align 16
|
||||
! CHECK: %7 = load <2 x i64>, ptr %3, align 16
|
||||
! CHECK: %8 = load <2 x i64>, ptr %4, align 16
|
||||
! CHECK: %9 = load <2 x i64>, ptr %5, align 16
|
||||
! CHECK: %10 = bitcast <2 x i64> %6 to <16 x i8>
|
||||
! CHECK: %11 = bitcast <2 x i64> %7 to <16 x i8>
|
||||
! CHECK: %12 = bitcast <2 x i64> %8 to <16 x i8>
|
||||
! CHECK: %13 = bitcast <2 x i64> %9 to <16 x i8>
|
||||
! CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
! CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
subroutine test_assemble_acc_r4()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(real(4)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_assemble_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_assemble_acc_r4
|
||||
|
||||
! CHECK-LABEL: @test_assemble_acc_r4
|
||||
! CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
! CHECK: %2 = alloca <4 x float>, i64 1, align 16
|
||||
! CHECK: %3 = alloca <4 x float>, i64 1, align 16
|
||||
! CHECK: %4 = alloca <4 x float>, i64 1, align 16
|
||||
! CHECK: %5 = alloca <4 x float>, i64 1, align 16
|
||||
! CHECK: %6 = load <4 x float>, ptr %2, align 16
|
||||
! CHECK: %7 = load <4 x float>, ptr %3, align 16
|
||||
! CHECK: %8 = load <4 x float>, ptr %4, align 16
|
||||
! CHECK: %9 = load <4 x float>, ptr %5, align 16
|
||||
! CHECK: %10 = bitcast <4 x float> %6 to <16 x i8>
|
||||
! CHECK: %11 = bitcast <4 x float> %7 to <16 x i8>
|
||||
! CHECK: %12 = bitcast <4 x float> %8 to <16 x i8>
|
||||
! CHECK: %13 = bitcast <4 x float> %9 to <16 x i8>
|
||||
! CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
! CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
subroutine test_assemble_acc_r8()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(real(8)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_assemble_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_assemble_acc_r8
|
||||
|
||||
!CHECK-LABEL: @test_assemble_acc_r8
|
||||
!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
!CHECK: %2 = alloca <2 x double>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <2 x double>, i64 1, align 16
|
||||
!CHECK: %4 = alloca <2 x double>, i64 1, align 16
|
||||
!CHECK: %5 = alloca <2 x double>, i64 1, align 16
|
||||
!CHECK: %6 = load <2 x double>, ptr %2, align 16
|
||||
!CHECK: %7 = load <2 x double>, ptr %3, align 16
|
||||
!CHECK: %8 = load <2 x double>, ptr %4, align 16
|
||||
!CHECK: %9 = load <2 x double>, ptr %5, align 16
|
||||
!CHECK: %10 = bitcast <2 x double> %6 to <16 x i8>
|
||||
!CHECK: %11 = bitcast <2 x double> %7 to <16 x i8>
|
||||
!CHECK: %12 = bitcast <2 x double> %8 to <16 x i8>
|
||||
!CHECK: %13 = bitcast <2 x double> %9 to <16 x i8>
|
||||
!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
!CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
! mma_assemble_pair
|
||||
|
||||
subroutine test_mma_assemble_pair_i1()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(integer(1)) vi10, vi11
|
||||
__vector_pair :: vp
|
||||
call mma_assemble_pair(vp, vi10, vi11)
|
||||
end subroutine test_mma_assemble_pair_i1
|
||||
|
||||
!CHECK: @test_mma_assemble_pair_i1_
|
||||
!CHECK: %1 = alloca <16 x i8>, i64 1, align 16
|
||||
!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <256 x i1>, i64 1, align 32
|
||||
!CHECK: %4 = load <16 x i8>, ptr %1, align 16
|
||||
!CHECK: %5 = load <16 x i8>, ptr %2, align 16
|
||||
!CHECK: %6 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %4, <16 x i8> %5)
|
||||
!CHECK: store <256 x i1> %6, ptr %3, align 32
|
||||
|
||||
subroutine test_mma_assemble_pair_i2()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(integer(2)) vi10, vi11
|
||||
__vector_pair :: vp
|
||||
call mma_assemble_pair(vp, vi10, vi11)
|
||||
end subroutine test_mma_assemble_pair_i2
|
||||
|
||||
!CHECK: @test_mma_assemble_pair_i2_
|
||||
!CHECK: %1 = alloca <8 x i16>, i64 1, align 16
|
||||
!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <256 x i1>, i64 1, align 32
|
||||
!CHECK: %4 = load <8 x i16>, ptr %1, align 16
|
||||
!CHECK: %5 = load <8 x i16>, ptr %2, align 16
|
||||
!CHECK: %6 = bitcast <8 x i16> %4 to <16 x i8>
|
||||
!CHECK: %7 = bitcast <8 x i16> %5 to <16 x i8>
|
||||
!CHECK: %8 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7)
|
||||
!CHECK: store <256 x i1> %8, ptr %3, align 32
|
||||
|
||||
subroutine test_mma_assemble_pair_i4()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(integer(4)) vi10, vi11
|
||||
__vector_pair :: vp
|
||||
call mma_assemble_pair(vp, vi10, vi11)
|
||||
end subroutine test_mma_assemble_pair_i4
|
||||
|
||||
!CHECK: @test_mma_assemble_pair_i4_
|
||||
!CHECK: %1 = alloca <4 x i32>, i64 1, align 16
|
||||
!CHECK: %2 = alloca <4 x i32>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <256 x i1>, i64 1, align 32
|
||||
!CHECK: %4 = load <4 x i32>, ptr %1, align 16
|
||||
!CHECK: %5 = load <4 x i32>, ptr %2, align 16
|
||||
!CHECK: %6 = bitcast <4 x i32> %4 to <16 x i8>
|
||||
!CHECK: %7 = bitcast <4 x i32> %5 to <16 x i8>
|
||||
!CHECK: %8 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7)
|
||||
!CHECK: store <256 x i1> %8, ptr %3, align 32
|
||||
|
||||
subroutine test_mma_assemble_pair_i8()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(integer(8)) vi10, vi11
|
||||
__vector_pair :: vp
|
||||
call mma_assemble_pair(vp, vi10, vi11)
|
||||
end subroutine test_mma_assemble_pair_i8
|
||||
|
||||
!CHECK: @test_mma_assemble_pair_i8_
|
||||
!CHECK: %1 = alloca <2 x i64>, i64 1, align 16
|
||||
!CHECK: %2 = alloca <2 x i64>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <256 x i1>, i64 1, align 32
|
||||
!CHECK: %4 = load <2 x i64>, ptr %1, align 16
|
||||
!CHECK: %5 = load <2 x i64>, ptr %2, align 16
|
||||
!CHECK: %6 = bitcast <2 x i64> %4 to <16 x i8>
|
||||
!CHECK: %7 = bitcast <2 x i64> %5 to <16 x i8>
|
||||
!CHECK: %8 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7)
|
||||
!CHECK: store <256 x i1> %8, ptr %3, align 32
|
||||
|
||||
subroutine test_mma_assemble_pair_u1()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(unsigned(1)) vi10, vi11
|
||||
__vector_pair :: vp
|
||||
call mma_assemble_pair(vp, vi10, vi11)
|
||||
end subroutine test_mma_assemble_pair_u1
|
||||
|
||||
!CHECK: @test_mma_assemble_pair_u1_
|
||||
!CHECK: %1 = alloca <16 x i8>, i64 1, align 16
|
||||
!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <256 x i1>, i64 1, align 32
|
||||
!CHECK: %4 = load <16 x i8>, ptr %1, align 16
|
||||
!CHECK: %5 = load <16 x i8>, ptr %2, align 16
|
||||
!CHECK: %6 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %4, <16 x i8> %5)
|
||||
!CHECK: store <256 x i1> %6, ptr %3, align 32
|
||||
|
||||
subroutine test_mma_assemble_pair_u2()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(unsigned(2)) vi10, vi11
|
||||
__vector_pair :: vp
|
||||
call mma_assemble_pair(vp, vi10, vi11)
|
||||
end subroutine test_mma_assemble_pair_u2
|
||||
|
||||
!CHECK: @test_mma_assemble_pair_u2_
|
||||
!CHECK: %1 = alloca <8 x i16>, i64 1, align 16
|
||||
!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <256 x i1>, i64 1, align 32
|
||||
!CHECK: %4 = load <8 x i16>, ptr %1, align 16
|
||||
!CHECK: %5 = load <8 x i16>, ptr %2, align 16
|
||||
!CHECK: %6 = bitcast <8 x i16> %4 to <16 x i8>
|
||||
!CHECK: %7 = bitcast <8 x i16> %5 to <16 x i8>
|
||||
!CHECK: %8 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7)
|
||||
!CHECK: store <256 x i1> %8, ptr %3, align 32
|
||||
|
||||
subroutine test_mma_assemble_pair_u4()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(unsigned(4)) vi10, vi11
|
||||
__vector_pair :: vp
|
||||
call mma_assemble_pair(vp, vi10, vi11)
|
||||
end subroutine test_mma_assemble_pair_u4
|
||||
|
||||
!CHECK: @test_mma_assemble_pair_u4_
|
||||
!CHECK: %1 = alloca <4 x i32>, i64 1, align 16
|
||||
!CHECK: %2 = alloca <4 x i32>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <256 x i1>, i64 1, align 32
|
||||
!CHECK: %4 = load <4 x i32>, ptr %1, align 16
|
||||
!CHECK: %5 = load <4 x i32>, ptr %2, align 16
|
||||
!CHECK: %6 = bitcast <4 x i32> %4 to <16 x i8>
|
||||
!CHECK: %7 = bitcast <4 x i32> %5 to <16 x i8>
|
||||
!CHECK: %8 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7)
|
||||
!CHECK: store <256 x i1> %8, ptr %3, align 32
|
||||
|
||||
subroutine test_mma_assemble_pair_u8()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(unsigned(8)) vi10, vi11
|
||||
__vector_pair :: vp
|
||||
call mma_assemble_pair(vp, vi10, vi11)
|
||||
end subroutine test_mma_assemble_pair_u8
|
||||
|
||||
!CHECK: @test_mma_assemble_pair_u8_
|
||||
!CHECK: %1 = alloca <2 x i64>, i64 1, align 16
|
||||
!CHECK: %2 = alloca <2 x i64>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <256 x i1>, i64 1, align 32
|
||||
!CHECK: %4 = load <2 x i64>, ptr %1, align 16
|
||||
!CHECK: %5 = load <2 x i64>, ptr %2, align 16
|
||||
!CHECK: %6 = bitcast <2 x i64> %4 to <16 x i8>
|
||||
!CHECK: %7 = bitcast <2 x i64> %5 to <16 x i8>
|
||||
!CHECK: %8 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7)
|
||||
!CHECK: store <256 x i1> %8, ptr %3, align 32
|
||||
|
||||
subroutine test_mma_assemble_pair_r4()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(real(4)) vi10, vi11
|
||||
__vector_pair :: vp
|
||||
call mma_assemble_pair(vp, vi10, vi11)
|
||||
end subroutine test_mma_assemble_pair_r4
|
||||
|
||||
!CHECK: @test_mma_assemble_pair_r4_
|
||||
!CHECK: %1 = alloca <4 x float>, i64 1, align 16
|
||||
!CHECK: %2 = alloca <4 x float>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <256 x i1>, i64 1, align 32
|
||||
!CHECK: %4 = load <4 x float>, ptr %1, align 16
|
||||
!CHECK: %5 = load <4 x float>, ptr %2, align 16
|
||||
!CHECK: %6 = bitcast <4 x float> %4 to <16 x i8>
|
||||
!CHECK: %7 = bitcast <4 x float> %5 to <16 x i8>
|
||||
!CHECK: %8 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7)
|
||||
!CHECK: store <256 x i1> %8, ptr %3, align 32
|
||||
|
||||
subroutine test_mma_assemble_pair_r8()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(real(8)) vi10, vi11
|
||||
__vector_pair :: vp
|
||||
call mma_assemble_pair(vp, vi10, vi11)
|
||||
end subroutine test_mma_assemble_pair_r8
|
||||
|
||||
!CHECK: @test_mma_assemble_pair_r8_
|
||||
!CHECK: %1 = alloca <2 x double>, i64 1, align 16
|
||||
!CHECK: %2 = alloca <2 x double>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <256 x i1>, i64 1, align 32
|
||||
!CHECK: %4 = load <2 x double>, ptr %1, align 16
|
||||
!CHECK: %5 = load <2 x double>, ptr %2, align 16
|
||||
!CHECK: %6 = bitcast <2 x double> %4 to <16 x i8>
|
||||
!CHECK: %7 = bitcast <2 x double> %5 to <16 x i8>
|
||||
!CHECK: %8 = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> %6, <16 x i8> %7)
|
||||
!CHECK: store <256 x i1> %8, ptr %3, align 32
|
||||
|
||||
! mma_build_acc
|
||||
|
||||
subroutine test_mma_build_acc_i1()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(integer(1)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_build_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_mma_build_acc_i1
|
||||
|
||||
!CHECK-LABEL: @test_mma_build_acc_i1
|
||||
!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
|
||||
!CHECK: %4 = alloca <16 x i8>, i64 1, align 16
|
||||
!CHECK: %5 = alloca <16 x i8>, i64 1, align 16
|
||||
!CHECK: %6 = load <16 x i8>, ptr %2, align 16
|
||||
!CHECK: %7 = load <16 x i8>, ptr %3, align 16
|
||||
!CHECK: %8 = load <16 x i8>, ptr %4, align 16
|
||||
!CHECK: %9 = load <16 x i8>, ptr %5, align 16
|
||||
!CHECK: %10 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %9, <16 x i8> %8, <16 x i8> %7, <16 x i8> %6)
|
||||
!CHECK: store <512 x i1> %10, ptr %1, align 64
|
||||
|
||||
subroutine test_mma_build_acc_i2()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(integer(2)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_build_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_mma_build_acc_i2
|
||||
|
||||
!CHECK-LABEL: @test_mma_build_acc_i2
|
||||
!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <8 x i16>, i64 1, align 16
|
||||
!CHECK: %4 = alloca <8 x i16>, i64 1, align 16
|
||||
!CHECK: %5 = alloca <8 x i16>, i64 1, align 16
|
||||
!CHECK: %6 = load <8 x i16>, ptr %2, align 16
|
||||
!CHECK: %7 = load <8 x i16>, ptr %3, align 16
|
||||
!CHECK: %8 = load <8 x i16>, ptr %4, align 16
|
||||
!CHECK: %9 = load <8 x i16>, ptr %5, align 16
|
||||
!CHECK: %10 = bitcast <8 x i16> %9 to <16 x i8>
|
||||
!CHECK: %11 = bitcast <8 x i16> %8 to <16 x i8>
|
||||
!CHECK: %12 = bitcast <8 x i16> %7 to <16 x i8>
|
||||
!CHECK: %13 = bitcast <8 x i16> %6 to <16 x i8>
|
||||
!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
!CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
subroutine test_mma_build_acc_i4()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(integer(4)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_build_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_mma_build_acc_i4
|
||||
|
||||
!CHECK-LABEL: @test_mma_build_acc_i4
|
||||
!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
!CHECK: %2 = alloca <4 x i32>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <4 x i32>, i64 1, align 16
|
||||
!CHECK: %4 = alloca <4 x i32>, i64 1, align 16
|
||||
!CHECK: %5 = alloca <4 x i32>, i64 1, align 16
|
||||
!CHECK: %6 = load <4 x i32>, ptr %2, align 16
|
||||
!CHECK: %7 = load <4 x i32>, ptr %3, align 16
|
||||
!CHECK: %8 = load <4 x i32>, ptr %4, align 16
|
||||
!CHECK: %9 = load <4 x i32>, ptr %5, align 16
|
||||
!CHECK: %10 = bitcast <4 x i32> %9 to <16 x i8>
|
||||
!CHECK: %11 = bitcast <4 x i32> %8 to <16 x i8>
|
||||
!CHECK: %12 = bitcast <4 x i32> %7 to <16 x i8>
|
||||
!CHECK: %13 = bitcast <4 x i32> %6 to <16 x i8>
|
||||
!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
!CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
subroutine test_mma_build_acc_i8()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(integer(8)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_build_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_mma_build_acc_i8
|
||||
|
||||
!CHECK-LABEL: @test_mma_build_acc_i8
|
||||
!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
!CHECK: %2 = alloca <2 x i64>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <2 x i64>, i64 1, align 16
|
||||
!CHECK: %4 = alloca <2 x i64>, i64 1, align 16
|
||||
!CHECK: %5 = alloca <2 x i64>, i64 1, align 16
|
||||
!CHECK: %6 = load <2 x i64>, ptr %2, align 16
|
||||
!CHECK: %7 = load <2 x i64>, ptr %3, align 16
|
||||
!CHECK: %8 = load <2 x i64>, ptr %4, align 16
|
||||
!CHECK: %9 = load <2 x i64>, ptr %5, align 16
|
||||
!CHECK: %10 = bitcast <2 x i64> %9 to <16 x i8>
|
||||
!CHECK: %11 = bitcast <2 x i64> %8 to <16 x i8>
|
||||
!CHECK: %12 = bitcast <2 x i64> %7 to <16 x i8>
|
||||
!CHECK: %13 = bitcast <2 x i64> %6 to <16 x i8>
|
||||
!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
!CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
subroutine test_mma_build_acc_u1()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(unsigned(1)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_build_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_mma_build_acc_u1
|
||||
|
||||
!CHECK-LABEL: @test_mma_build_acc_u1
|
||||
!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
!CHECK: %2 = alloca <16 x i8>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <16 x i8>, i64 1, align 16
|
||||
!CHECK: %4 = alloca <16 x i8>, i64 1, align 16
|
||||
!CHECK: %5 = alloca <16 x i8>, i64 1, align 16
|
||||
!CHECK: %6 = load <16 x i8>, ptr %2, align 16
|
||||
!CHECK: %7 = load <16 x i8>, ptr %3, align 16
|
||||
!CHECK: %8 = load <16 x i8>, ptr %4, align 16
|
||||
!CHECK: %9 = load <16 x i8>, ptr %5, align 16
|
||||
!CHECK: %10 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %9, <16 x i8> %8, <16 x i8> %7, <16 x i8> %6)
|
||||
!CHECK: store <512 x i1> %10, ptr %1, align 64
|
||||
|
||||
subroutine test_mma_build_acc_u2()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(unsigned(2)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_build_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_mma_build_acc_u2
|
||||
|
||||
!CHECK-LABEL: @test_mma_build_acc_u2
|
||||
!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
!CHECK: %2 = alloca <8 x i16>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <8 x i16>, i64 1, align 16
|
||||
!CHECK: %4 = alloca <8 x i16>, i64 1, align 16
|
||||
!CHECK: %5 = alloca <8 x i16>, i64 1, align 16
|
||||
!CHECK: %6 = load <8 x i16>, ptr %2, align 16
|
||||
!CHECK: %7 = load <8 x i16>, ptr %3, align 16
|
||||
!CHECK: %8 = load <8 x i16>, ptr %4, align 16
|
||||
!CHECK: %9 = load <8 x i16>, ptr %5, align 16
|
||||
!CHECK: %10 = bitcast <8 x i16> %9 to <16 x i8>
|
||||
!CHECK: %11 = bitcast <8 x i16> %8 to <16 x i8>
|
||||
!CHECK: %12 = bitcast <8 x i16> %7 to <16 x i8>
|
||||
!CHECK: %13 = bitcast <8 x i16> %6 to <16 x i8>
|
||||
!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
!CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
subroutine test_mma_build_acc_u4()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(unsigned(4)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_build_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_mma_build_acc_u4
|
||||
|
||||
!CHECK-LABEL: @test_mma_build_acc_u4
|
||||
!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
!CHECK: %2 = alloca <4 x i32>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <4 x i32>, i64 1, align 16
|
||||
!CHECK: %4 = alloca <4 x i32>, i64 1, align 16
|
||||
!CHECK: %5 = alloca <4 x i32>, i64 1, align 16
|
||||
!CHECK: %6 = load <4 x i32>, ptr %2, align 16
|
||||
!CHECK: %7 = load <4 x i32>, ptr %3, align 16
|
||||
!CHECK: %8 = load <4 x i32>, ptr %4, align 16
|
||||
!CHECK: %9 = load <4 x i32>, ptr %5, align 16
|
||||
!CHECK: %10 = bitcast <4 x i32> %9 to <16 x i8>
|
||||
!CHECK: %11 = bitcast <4 x i32> %8 to <16 x i8>
|
||||
!CHECK: %12 = bitcast <4 x i32> %7 to <16 x i8>
|
||||
!CHECK: %13 = bitcast <4 x i32> %6 to <16 x i8>
|
||||
!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
!CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
subroutine test_mma_build_acc_u8()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(unsigned(8)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_build_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_mma_build_acc_u8
|
||||
|
||||
!CHECK-LABEL: @test_mma_build_acc_u8
|
||||
!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
!CHECK: %2 = alloca <2 x i64>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <2 x i64>, i64 1, align 16
|
||||
!CHECK: %4 = alloca <2 x i64>, i64 1, align 16
|
||||
!CHECK: %5 = alloca <2 x i64>, i64 1, align 16
|
||||
!CHECK: %6 = load <2 x i64>, ptr %2, align 16
|
||||
!CHECK: %7 = load <2 x i64>, ptr %3, align 16
|
||||
!CHECK: %8 = load <2 x i64>, ptr %4, align 16
|
||||
!CHECK: %9 = load <2 x i64>, ptr %5, align 16
|
||||
!CHECK: %10 = bitcast <2 x i64> %9 to <16 x i8>
|
||||
!CHECK: %11 = bitcast <2 x i64> %8 to <16 x i8>
|
||||
!CHECK: %12 = bitcast <2 x i64> %7 to <16 x i8>
|
||||
!CHECK: %13 = bitcast <2 x i64> %6 to <16 x i8>
|
||||
!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
!CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
|
||||
subroutine test_mma_build_acc_r4()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(real(4)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_build_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_mma_build_acc_r4
|
||||
|
||||
!CHECK-LABEL: @test_mma_build_acc_r4
|
||||
!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
!CHECK: %2 = alloca <4 x float>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <4 x float>, i64 1, align 16
|
||||
!CHECK: %4 = alloca <4 x float>, i64 1, align 16
|
||||
!CHECK: %5 = alloca <4 x float>, i64 1, align 16
|
||||
!CHECK: %6 = load <4 x float>, ptr %2, align 16
|
||||
!CHECK: %7 = load <4 x float>, ptr %3, align 16
|
||||
!CHECK: %8 = load <4 x float>, ptr %4, align 16
|
||||
!CHECK: %9 = load <4 x float>, ptr %5, align 16
|
||||
!CHECK: %10 = bitcast <4 x float> %9 to <16 x i8>
|
||||
!CHECK: %11 = bitcast <4 x float> %8 to <16 x i8>
|
||||
!CHECK: %12 = bitcast <4 x float> %7 to <16 x i8>
|
||||
!CHECK: %13 = bitcast <4 x float> %6 to <16 x i8>
|
||||
!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
!CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
|
||||
subroutine test_mma_build_acc_r8()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
vector(real(8)) vi10, vi11, vi12, vi13
|
||||
__vector_quad :: cq
|
||||
call mma_build_acc(cq, vi10, vi11, vi12, vi13)
|
||||
end subroutine test_mma_build_acc_r8
|
||||
|
||||
!CHECK-LABEL: @test_mma_build_acc_r8
|
||||
!CHECK: %1 = alloca <512 x i1>, i64 1, align 64
|
||||
!CHECK: %2 = alloca <2 x double>, i64 1, align 16
|
||||
!CHECK: %3 = alloca <2 x double>, i64 1, align 16
|
||||
!CHECK: %4 = alloca <2 x double>, i64 1, align 16
|
||||
!CHECK: %5 = alloca <2 x double>, i64 1, align 16
|
||||
!CHECK: %6 = load <2 x double>, ptr %2, align 16
|
||||
!CHECK: %7 = load <2 x double>, ptr %3, align 16
|
||||
!CHECK: %8 = load <2 x double>, ptr %4, align 16
|
||||
!CHECK: %9 = load <2 x double>, ptr %5, align 16
|
||||
!CHECK: %10 = bitcast <2 x double> %9 to <16 x i8>
|
||||
!CHECK: %11 = bitcast <2 x double> %8 to <16 x i8>
|
||||
!CHECK: %12 = bitcast <2 x double> %7 to <16 x i8>
|
||||
!CHECK: %13 = bitcast <2 x double> %6 to <16 x i8>
|
||||
!CHECK: %14 = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> %10, <16 x i8> %11, <16 x i8> %12, <16 x i8> %13)
|
||||
!CHECK: store <512 x i1> %14, ptr %1, align 64
|
||||
|
||||
! mma_disassemble_acc
|
||||
|
||||
subroutine test_disassemble_acc()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
__vector_quad :: vq
|
||||
real :: data
|
||||
call mma_disassemble_acc(data, vq)
|
||||
end subroutine
|
||||
|
||||
!CHECK-LABEL: @test_disassemble_acc_
|
||||
!CHECK: %1 = alloca float, i64 1, align 4
|
||||
!CHECK: %2 = alloca <512 x i1>, i64 1, align 64
|
||||
!CHECK: %3 = load <512 x i1>, ptr %2, align 64
|
||||
!CHECK: %4 = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> %3)
|
||||
!CHECK: store { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %4, ptr %1, align 16
|
||||
|
||||
! mma_disassemble_pair
|
||||
|
||||
subroutine test_disassemble_pair()
|
||||
use, intrinsic :: mma
|
||||
implicit none
|
||||
__vector_pair :: vp
|
||||
real :: data
|
||||
call mma_disassemble_pair(data, vp)
|
||||
end subroutine
|
||||
|
||||
!CHECK-LABEL: @test_disassemble_pair_
|
||||
!CHECK: %1 = alloca float, i64 1, align 4
|
||||
!CHECK: %2 = alloca <256 x i1>, i64 1, align 32
|
||||
!CHECK: %3 = load <256 x i1>, ptr %2, align 32
|
||||
!CHECK: %4 = call { <16 x i8>, <16 x i8> } @llvm.ppc.vsx.disassemble.pair(<256 x i1> %3)
|
||||
!CHECK: store { <16 x i8>, <16 x i8> } %4, ptr %1, align 16
|
||||
@@ -10,6 +10,7 @@ set(MODULES
|
||||
"__fortran_type_info"
|
||||
"__ppc_types"
|
||||
"__ppc_intrinsics"
|
||||
"mma"
|
||||
"__cuda_builtins"
|
||||
"ieee_arithmetic"
|
||||
"ieee_exceptions"
|
||||
@@ -32,7 +33,8 @@ if (NOT CMAKE_CROSSCOMPILING)
|
||||
set(depends "")
|
||||
elseif(${filename} STREQUAL "__ppc_types")
|
||||
set(depends "")
|
||||
elseif(${filename} STREQUAL "__ppc_intrinsics")
|
||||
elseif(${filename} STREQUAL "__ppc_intrinsics" OR
|
||||
${filename} STREQUAL "mma")
|
||||
set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__ppc_types.mod)
|
||||
else()
|
||||
set(depends ${FLANG_INTRINSIC_MODULES_DIR}/__fortran_builtins.mod)
|
||||
@@ -47,7 +49,8 @@ if (NOT CMAKE_CROSSCOMPILING)
|
||||
|
||||
# The module contains PPC vector types that need the PPC target.
|
||||
set(opts "")
|
||||
if(${filename} STREQUAL "__ppc_intrinsics")
|
||||
if(${filename} STREQUAL "__ppc_intrinsics" OR
|
||||
${filename} STREQUAL "mma")
|
||||
set(opts "--target=ppc64le")
|
||||
endif()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user