clang-p2996/mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp
Nicolas Vasilache 44e6318cea [mlir][transforms] Revamp the implementation of mapping loops to GPUs
This revision significantly simplifies the specification and implementation of mapping loops to GPU ids.

Each type of mapping (block, warpgroup, warp, thread) now comes with 2 mapping modes:
  1. a 3-D "grid-like" mode, subject to alignment considerations on threadIdx.x, on which predication
     may occur on a per-dimension 3-D sub-rectangle basis.
  2. a n-D linearized mode, on which predication may only occur on a linear basis.

In the process, better size and alignment requirement inference is introduced, along with improved runtime verification messages.

The `warp_dims` attribute was deemed confusing and is removed from the transform in favor of better size inference.

Differential Revision: https://reviews.llvm.org/D155941
2023-07-26 00:09:08 +02:00


//===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Support/MathExtras.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cmath>
#include <numeric>
using namespace mlir;
#define DEBUG_TYPE "linalg-transforms"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
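
// Shorthand helpers to build the three linear GPU thread mapping attributes
// (LinearDim0/1/2) that are used below as the thread mapping for the copy.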
static Attribute linearId0(MLIRContext *ctx) {
  return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim0);
}
static Attribute linearId1(MLIRContext *ctx) {
  return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim1);
}
static Attribute linearId2(MLIRContext *ctx) {
  return gpu::GPUThreadMappingAttr::get(ctx, gpu::MappingId::LinearDim2);
}
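
// The constructor proceeds in three steps:
//   1. greedily pick the largest vector transfer size usable on the most-minor
//      dimension (maxContiguousElementsToTransfer);
//   2. infer how many threads to use per copy dimension within
//      `totalNumThreads` (inferNumThreads);
//   3. derive the per-thread smallest bounding tile sizes and the linear
//      thread mapping.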
transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx,
                                                 int totalNumThreads,
                                                 int64_t desiredBitAlignment,
                                                 ArrayRef<int64_t> copySizes,
                                                 bool favorPredication,
                                                 int64_t elementalBitwidth) {
  assert(!copySizes.empty() && copySizes.size() <= 3 &&
         "only 1,2,3-D copies are supported for now");

  LDBG("START CopyMappingInfo, favorPredication: " << favorPredication);
  LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy shape: ");
             llvm::dbgs() << "\n";);

  // Greedily find the largest vector size that can be used to copy the most
  // minor dimension: we are in the business of filling kMaxVectorLoadBitWidth
  // contiguous memory transactions with as few threads as possible.
  int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer(
      desiredBitAlignment, copySizes.back(), elementalBitwidth);

  LDBG("--greedily determined vectorSize: "
       << desiredVectorSize << " elements of " << elementalBitwidth
       << "b each -> " << (desiredVectorSize * elementalBitwidth)
       << "b total out of a max of " << kMaxVectorLoadBitWidth << "b");

  status = inferNumThreads(totalNumThreads, copySizes, desiredVectorSize,
                           favorPredication);
  if (status == Status::Invalid)
    return;

  LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy: ");
             llvm::dbgs() << "\n"; llvm::interleaveComma(
                 this->numThreads, DBGS() << "--numThreads: ");
             llvm::dbgs() << "\n";);
  LDBG("--vectorSize: " << this->vectorSize);
  assert(this->numThreads.size() == copySizes.size() &&
         "compute copy mapping expected same number of threads and copy sizes");

  // Compute the smallest bounding box.
  this->smallestBoundingTileSizes = llvm::to_vector(
      llvm::map_range(llvm::zip(copySizes, this->numThreads), [](auto &&pair) {
        int64_t size, numThreads;
        std::tie(size, numThreads) = pair;
        return mlir::ceilDiv(size, numThreads);
      }));
  SmallVector<Attribute> allThreadMappings{linearId2(ctx), linearId1(ctx),
                                           linearId0(ctx)};

  // Set the thread mapping.
  this->threadMapping =
      llvm::to_vector(ArrayRef(allThreadMappings)
                          .take_back(this->smallestBoundingTileSizes.size()));
  LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n");
}
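
// Example (for illustration): with a desired alignment of 128 bits, 8
// contiguous elements of 32 bits each and, assuming kMaxVectorLoadBitWidth is
// 128 (the constant is defined in the header), the result is
//   gcd(gcd(128/32, 8), 128/32) = gcd(gcd(4, 8), 4) = 4 elements,
// i.e. a single 128-bit vector transfer per thread.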
int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer(
    int64_t desiredBitAlignment, int64_t numContiguousElements,
    int64_t elementalBitwidth) {
  assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 &&
         "elemental bitwidth does not divide kMaxVectorLoadBitWidth");
  assert(desiredBitAlignment % elementalBitwidth == 0 &&
         "elemental bitwidth does not divide desired bit alignment");
  return std::gcd(
      std::gcd(desiredBitAlignment / elementalBitwidth, numContiguousElements),
      kMaxVectorLoadBitWidth / elementalBitwidth);
}

/// Get the list of all factors that divide `val`, not just the prime factors.
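/// E.g., getFactors(8) returns {1, 2, 4, 8} and getFactors(12) returns
/// {1, 2, 3, 4, 6, 12}.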
static SmallVector<int64_t> getFactors(int64_t val) {
  SmallVector<int64_t> factors;
  factors.reserve(val);
  for (int64_t factor = 1; factor <= val / 2; ++factor) {
    if (val % factor != 0)
      continue;
    factors.push_back(factor);
  }
  factors.push_back(val);
  return factors;
}

static int64_t product(ArrayRef<int64_t> vals) {
  int64_t res = 1;
  for (auto val : vals)
    res *= val;
  return res;
}

/// Extract `result` from `sizes` with the following constraints:
///   1. sizes[i] % result[i] == 0 for all i
///   2. product_of_threadsPerDim <= maxNumThreads
///   3. if `currentIndex` is sizes.size() - 1, then threadsPerDim[currentIndex]
///      must be sizes[currentIndex].
/// This is used to greedily extract the maximum number of threads usable for
/// mapping a copy of size `sizes`, while being bounded by `totalNumThreads` and
/// ensuring coalesced access along the most minor dimension.
/// Return the number of threads used in the range:
///   threadsPerDim[currentIndex .. sizes.end()]
// The implementation uses a dynamic programming approach to greedily extract
// the best combination under the constraints.
// TODO: Implementation details can be improved, but putting effort there is a
// tradeoff: `sizes` is expected to be of small rank and contain small values.
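// Example (for illustration): maximizeNumThreads({16, 32}, /*currentIndex=*/0,
// /*maxNumThreads=*/128) mandates 32 threads on the most-minor dimension and
// then picks the largest factor of 16 whose product with 32 stays within 128,
// i.e. 4, yielding {4, 32} (128 threads in total).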
static SmallVector<int64_t> maximizeNumThreads(ArrayRef<int64_t> sizes,
                                               int64_t currentIndex,
                                               int64_t maxNumThreads) {
  assert(static_cast<size_t>(currentIndex) < sizes.size() &&
         "currentIndex out of bounds");
  std::string indent(2 * currentIndex, '-');
  if (static_cast<size_t>(currentIndex) == sizes.size() - 1) {
    LDBG(indent << "mandated globalBest: " << sizes[currentIndex]);
    return SmallVector<int64_t>{sizes[currentIndex]};
  }

  int64_t best = 0;
  int64_t s = sizes[currentIndex];
  SmallVector<int64_t> factors = getFactors(s);
  SmallVector<int64_t> localThreadsPerDim;
  localThreadsPerDim.reserve(sizes.size());
  LDBG(indent << "maximizeNumThreads in " << s
              << " with limit: " << maxNumThreads);
  for (auto factor : factors) {
    auto nestedThreadsPerDim =
        maximizeNumThreads(sizes, currentIndex + 1, maxNumThreads / factor);
    int64_t localBest = factor * product(nestedThreadsPerDim);
    if (localBest > best && localBest <= maxNumThreads) {
      LDBG(indent << "new localBest: " << localBest);
      LLVM_DEBUG(
          llvm::interleaveComma(nestedThreadsPerDim,
                                DBGS() << indent << "nestedThreadsPerDim: ");
          llvm::dbgs() << "\n";);
      localThreadsPerDim.clear();
      localThreadsPerDim.push_back(factor);
      llvm::append_range(localThreadsPerDim, nestedThreadsPerDim);
      best = localBest;
    }
  }

  LDBG(indent << "found globalBest: " << best);
  LLVM_DEBUG(llvm::interleaveComma(localThreadsPerDim,
                                   DBGS() << indent << "numThreads: ");
             llvm::dbgs() << "\n";);
  return localThreadsPerDim;
}
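
// When predication is not favored, progressively halve the vector size looking
// for a mapping that exactly fills `totalNumThreads`; if none is found (or if
// predication is favored), map at the desired vector size and accept that
// predication will be required.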
transform::gpu::CopyMappingInfo::Status
transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads,
                                                 ArrayRef<int64_t> sizes,
                                                 int64_t desiredVectorSize,
                                                 bool favorPredication) {
  if (!favorPredication) {
    int64_t localVectorSize = desiredVectorSize;
    for (; localVectorSize >= 1; localVectorSize /= 2) {
      // Attempt to map the copy with the current fixed vector size:
      //   1. if the status is Success, we are done.
      //   2. if the status is Invalid, we fail immediately: no amount of
      //      vector size reduction can offset the bad tile size selection from
      //      the higher level.
      //   3. if the status is RequiresPredication, we try again with a smaller
      //      vector size.
      Status status =
          inferNumThreadsImpl(totalNumThreads, sizes, localVectorSize);
      if (status == Status::Success || status == Status::Invalid)
        return status;

      LDBG("requires predication, try reducing vector size to "
           << (localVectorSize / 2));
    }
  }

  // If we have not yet returned, it means that we have tried all vector sizes
  // and we still require predication. Restart from the original vector size
  // and accept that predication will be required.
  return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize);
}
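
// Example (for illustration): with totalNumThreads = 128, sizes = {16, 128}
// and desiredVectorSize = 4, the scaled sizes are {16, 32}; maximizeNumThreads
// then yields {4, 32}, which uses exactly 128 threads, so the result is
// Status::Success.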
transform::gpu::CopyMappingInfo::Status
transform::gpu::CopyMappingInfo::inferNumThreadsImpl(
    int64_t totalNumThreads, ArrayRef<int64_t> sizes,
    int64_t desiredVectorSize) {
  assert(sizes.back() % desiredVectorSize == 0 &&
         "most-minor size not divisible by desiredVectorSize");
  LDBG("inferNumThreadsImpl with totalNumThreads: "
       << totalNumThreads << " and vectorSize: " << desiredVectorSize);

  // Scale the most minor size to account for the chosen vector size and
  // maximize the number of threads without exceeding the total number of
  // threads.
  SmallVector<int64_t> scaledSizes{sizes};
  scaledSizes.back() /= desiredVectorSize;
  if (scaledSizes.back() > totalNumThreads) {
    LDBG("--Too few threads given the required vector size -> FAIL");
    return Status::Invalid;
  }
  SmallVector<int64_t> inferredNumThreads =
      maximizeNumThreads(scaledSizes, 0, totalNumThreads);

  LLVM_DEBUG(llvm::interleaveComma(inferredNumThreads,
                                   DBGS() << "inferred numThreads: ");
             llvm::dbgs() << "\n";
             LDBG("computed actualVectorSize: " << desiredVectorSize););

  // Corner case: we cannot use more threads than available. If the copy
  // dimensions are this badly shaped, it is because higher-level tiling did
  // not do its job; we do not try to recover from that here.
  int64_t totalNumThreadsUsed = product(inferredNumThreads);
  LDBG("--totalNumThreadsUsed: " << totalNumThreadsUsed);
  if (totalNumThreadsUsed == 0 || totalNumThreadsUsed > totalNumThreads) {
    LDBG("--Too few threads given the required vector size -> FAIL");
    return Status::Invalid;
  }

  this->vectorSize = desiredVectorSize;
  this->numThreads = inferredNumThreads;
  if (totalNumThreadsUsed == totalNumThreads)
    return Status::Success;

  return Status::RequiresPredication;
}

void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const {
  os << "MappingInfo{";
  os << "CopyMappingInfo: ";
  os << "valid: " << (status != Status::Invalid) << ", ";
  os << "vectorSize: " << vectorSize;
  llvm::interleaveComma(numThreads, os << ", numThreads: {");
  llvm::interleaveComma(smallestBoundingTileSizes,
                        os << "}, smallestBoundingTileSizes: {");
  llvm::interleaveComma(threadMapping, os << "}, threadMapping: {");
  os << "}}";
}