This class used to serve a few useful purposes: * Allowed containing a null DictionaryAttr * Provided some simple mutable API around a DictionaryAttr The first of which is no longer an issue now that there is much better caching support for attributes in general, and a cache in the context for empty dictionaries. The second results in more trouble than it's worth because it mutates the internal dictionary on every action, leading to a potentially large number of dictionary copies. NamedAttrList is a much better alternative for the second use case, and should be modified as needed to better fit it's usage as a DictionaryAttrBuilder. Differential Revision: https://reviews.llvm.org/D93442
284 lines
11 KiB
C++
284 lines
11 KiB
C++
//===- AsyncRegionRewriter.cpp - Implementation of GPU async rewriters ----===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This file implements the GPU dialect pattern rewriters that make GPU op
|
|
// within a region execute asynchronously.
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "PassDetail.h"
|
|
#include "mlir/Dialect/Async/IR/Async.h"
|
|
#include "mlir/Dialect/GPU/GPUDialect.h"
|
|
#include "mlir/Dialect/GPU/Passes.h"
|
|
#include "mlir/Dialect/GPU/Utils.h"
|
|
#include "mlir/Dialect/StandardOps/IR/Ops.h"
|
|
#include "mlir/IR/BlockAndValueMapping.h"
|
|
#include "mlir/IR/Builders.h"
|
|
#include "mlir/IR/PatternMatch.h"
|
|
#include "mlir/IR/SymbolTable.h"
|
|
#include "mlir/Support/LLVM.h"
|
|
#include "mlir/Transforms/RegionUtils.h"
|
|
#include "llvm/ADT/TypeSwitch.h"
|
|
|
|
using namespace mlir;
|
|
namespace {
|
|
class GpuAsyncRegionPass : public GpuAsyncRegionPassBase<GpuAsyncRegionPass> {
|
|
struct ThreadTokenCallback;
|
|
struct DeferWaitCallback;
|
|
void runOnFunction() override;
|
|
};
|
|
} // namespace
|
|
|
|
static bool isTerminator(Operation *op) { return !op->isKnownNonTerminator(); }
|
|
static bool hasSideEffects(Operation *op) {
|
|
return !MemoryEffectOpInterface::hasNoEffect(op);
|
|
}
|
|
|
|
// Region walk callback which makes GPU ops implementing the AsyncOpInterface
|
|
// execute asynchronously.
|
|
struct GpuAsyncRegionPass::ThreadTokenCallback {
|
|
ThreadTokenCallback(MLIRContext &context) : builder(&context) {}
|
|
|
|
// If `op` implements the AsyncOpInterface, insert a `gpu.wait async` to
|
|
// create a current token (unless it already exists), and 'thread' that token
|
|
// through the `op` so that it executes asynchronously.
|
|
//
|
|
// If `op` is a terminator or an op with side-effects, insert a `gpu.wait` to
|
|
// host-synchronize execution. A `!gpu.async.token` will therefore only be
|
|
// used inside of its block and GPU execution will always synchronize with
|
|
// the host at block boundaries.
|
|
WalkResult operator()(Operation *op) {
|
|
if (isa<gpu::LaunchOp>(op))
|
|
return op->emitOpError("replace with gpu.launch_func first");
|
|
if (isa<gpu::WaitOp>(op))
|
|
return op->emitOpError("unexpected pre-existing gpu.wait");
|
|
builder.setInsertionPoint(op);
|
|
if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(op))
|
|
return rewriteAsyncOp(asyncOp); // Replace GPU op with async version.
|
|
if (!currentToken)
|
|
return success();
|
|
// Insert host synchronization before terminator or op with side effects.
|
|
if (isTerminator(op) || hasSideEffects(op))
|
|
currentToken = createWaitOp(op->getLoc(), Type(), {currentToken});
|
|
return success();
|
|
}
|
|
|
|
private:
|
|
// Replaces asyncOp with a clone that returns a token.
|
|
LogicalResult rewriteAsyncOp(gpu::AsyncOpInterface asyncOp) {
|
|
auto *op = asyncOp.getOperation();
|
|
if (asyncOp.getAsyncToken())
|
|
// TODO: Support ops that are already async.
|
|
return op->emitOpError("is already async");
|
|
if (op->getNumRegions() > 0)
|
|
return op->emitOpError("regions are not supported");
|
|
|
|
// If there is no current token, insert a `gpu.wait async` without
|
|
// dependencies to create one.
|
|
if (!currentToken)
|
|
currentToken = createWaitOp(op->getLoc(), tokenType, {});
|
|
asyncOp.addAsyncDependency(currentToken);
|
|
|
|
// Clone the op to return a token in addition to the other results.
|
|
SmallVector<Type, 1> resultTypes;
|
|
resultTypes.reserve(1 + op->getNumResults());
|
|
copy(op->getResultTypes(), std::back_inserter(resultTypes));
|
|
resultTypes.push_back(tokenType);
|
|
auto *newOp = Operation::create(op->getLoc(), op->getName(), resultTypes,
|
|
op->getOperands(), op->getAttrDictionary(),
|
|
op->getSuccessors());
|
|
|
|
// Replace the op with the async clone.
|
|
auto results = newOp->getResults();
|
|
currentToken = results.back();
|
|
builder.insert(newOp);
|
|
op->replaceAllUsesWith(results.drop_back());
|
|
op->erase();
|
|
|
|
return success();
|
|
}
|
|
|
|
Value createWaitOp(Location loc, Type resultType, ValueRange operands) {
|
|
return builder.create<gpu::WaitOp>(loc, resultType, operands).asyncToken();
|
|
}
|
|
|
|
OpBuilder builder;
|
|
const Type tokenType = builder.getType<gpu::AsyncTokenType>();
|
|
// The token that represents the current asynchronous dependency. It's valid
|
|
// range starts with a `gpu.wait async` op, and ends with a `gpu.wait` op.
|
|
// In between, each gpu::AsyncOpInterface depends on the current token and
|
|
// produces the new one.
|
|
Value currentToken = {};
|
|
};
|
|
|
|
// Callback for `async.execute` ops which tries to push the contained
|
|
// synchronous `gpu.wait` op to the dependencies of the `async.execute`.
|
|
struct GpuAsyncRegionPass::DeferWaitCallback {
|
|
// If the `executeOp`s token is used only in `async.execute` or `async.await`
|
|
// ops, add the region's last `gpu.wait` op to the worklist if it is
|
|
// synchronous and is the last op with side effects.
|
|
void operator()(async::ExecuteOp executeOp) {
|
|
if (!areAllUsersExecuteOrAwait(executeOp.token()))
|
|
return;
|
|
// async.execute's region is currently restricted to one block.
|
|
for (auto &op : llvm::reverse(executeOp.getBody()->without_terminator())) {
|
|
if (auto waitOp = dyn_cast<gpu::WaitOp>(op)) {
|
|
if (!waitOp.asyncToken())
|
|
worklist.push_back(waitOp);
|
|
return;
|
|
}
|
|
if (hasSideEffects(&op))
|
|
return;
|
|
}
|
|
}
|
|
|
|
// The destructor performs the actual rewrite work.
|
|
~DeferWaitCallback() {
|
|
for (size_t i = 0; i < worklist.size(); ++i) {
|
|
auto waitOp = worklist[i];
|
|
auto executeOp = waitOp->getParentOfType<async::ExecuteOp>();
|
|
auto numDependencies = waitOp.asyncDependencies().size();
|
|
|
|
// Erase `gpu.wait` and return async dependencies from region instead.
|
|
auto &yieldOp = executeOp.getBody()->getOperations().back();
|
|
yieldOp.insertOperands(yieldOp.getNumOperands(),
|
|
waitOp.asyncDependencies());
|
|
waitOp.erase();
|
|
auto asyncTokens = addAsyncTokenResults(executeOp, numDependencies);
|
|
|
|
// Add the async dependency to each user of the `async.execute` token.
|
|
for (Operation *user : executeOp.token().getUsers())
|
|
addAsyncDependencyAfter(asyncTokens, user);
|
|
}
|
|
}
|
|
|
|
private:
|
|
// Append `count` `!async.value<!gpu.async.token>` results to `executeOp`.
|
|
static ValueRange addAsyncTokenResults(async::ExecuteOp &executeOp,
|
|
unsigned count) {
|
|
auto numResults = executeOp.getNumResults() + count;
|
|
|
|
// Construct new result type list with `count` additional types.
|
|
SmallVector<Type, 2> resultTypes;
|
|
resultTypes.reserve(numResults);
|
|
transform(executeOp.getResultTypes(), std::back_inserter(resultTypes),
|
|
[](Type type) {
|
|
// Extract value type from !async.value.
|
|
if (auto valueType = type.dyn_cast<async::ValueType>())
|
|
return valueType.getValueType();
|
|
assert(type.isa<async::TokenType>() && "expected token type");
|
|
return type;
|
|
});
|
|
OpBuilder builder(executeOp);
|
|
auto tokenType = builder.getType<gpu::AsyncTokenType>();
|
|
resultTypes.resize(numResults, tokenType);
|
|
|
|
// Clone executeOp with the extra `!gpu.async.token` results.
|
|
auto newOp = builder.create<async::ExecuteOp>(
|
|
executeOp.getLoc(), TypeRange{resultTypes}.drop_front() /*drop token*/,
|
|
executeOp.dependencies(), executeOp.operands());
|
|
BlockAndValueMapping mapper;
|
|
newOp.getRegion().getBlocks().clear();
|
|
executeOp.getRegion().cloneInto(&newOp.getRegion(), mapper);
|
|
|
|
// Replace executeOp with cloned one.
|
|
executeOp.getOperation()->replaceAllUsesWith(
|
|
newOp.getResults().drop_back(count));
|
|
executeOp.erase();
|
|
executeOp = newOp;
|
|
|
|
// Return the new result values.
|
|
return executeOp.getResults().take_back(count);
|
|
}
|
|
|
|
// Returns whether all token users are either 'async.execute' or 'async.await'
|
|
// ops. This is used as a requirement for pushing 'gpu.wait' ops from a
|
|
// 'async.execute' body to it's users. Specifically, we do not allow
|
|
// terminator users, because it could mean that the `async.execute` is inside
|
|
// control flow code.
|
|
static bool areAllUsersExecuteOrAwait(Value token) {
|
|
return llvm::all_of(token.getUsers(), [](Operation *user) {
|
|
return isa<async::ExecuteOp, async::AwaitOp>(user);
|
|
});
|
|
}
|
|
|
|
// Add the `asyncToken` as dependency as needed after `op`.
|
|
void addAsyncDependencyAfter(ValueRange asyncTokens, Operation *op) {
|
|
OpBuilder builder(op->getContext());
|
|
auto loc = op->getLoc();
|
|
|
|
Block::iterator it;
|
|
SmallVector<Value, 1> tokens;
|
|
tokens.reserve(asyncTokens.size());
|
|
TypeSwitch<Operation *>(op)
|
|
.Case<async::AwaitOp>([&](auto awaitOp) {
|
|
// Add async.await ops to wait for the !gpu.async.tokens.
|
|
builder.setInsertionPointAfter(op);
|
|
for (auto asyncToken : asyncTokens)
|
|
tokens.push_back(
|
|
builder.create<async::AwaitOp>(loc, asyncToken).result());
|
|
// Set `it` after the inserted async.await ops.
|
|
it = builder.getInsertionPoint();
|
|
})
|
|
.Case<async::ExecuteOp>([&](auto executeOp) {
|
|
// Set `it` to the beginning of the region and add asyncTokens to the
|
|
// async.execute operands.
|
|
it = executeOp.getBody()->begin();
|
|
executeOp.operandsMutable().append(asyncTokens);
|
|
SmallVector<Type, 1> tokenTypes(
|
|
asyncTokens.size(), builder.getType<gpu::AsyncTokenType>());
|
|
copy(executeOp.getBody()->addArguments(tokenTypes),
|
|
std::back_inserter(tokens));
|
|
});
|
|
|
|
// Advance `it` to terminator or op with side-effects.
|
|
it = std::find_if(it, Block::iterator(), [](Operation &op) {
|
|
return isTerminator(&op) || hasSideEffects(&op);
|
|
});
|
|
|
|
// If `op` implements the AsyncOpInterface, add `token` to the list of async
|
|
// dependencies.
|
|
if (auto asyncOp = dyn_cast<gpu::AsyncOpInterface>(*it)) {
|
|
for (auto token : tokens)
|
|
asyncOp.addAsyncDependency(token);
|
|
return;
|
|
}
|
|
|
|
// Otherwise, insert a gpu.wait before 'it'.
|
|
builder.setInsertionPoint(it->getBlock(), it);
|
|
auto waitOp = builder.create<gpu::WaitOp>(loc, Type{}, tokens);
|
|
|
|
// If the new waitOp is at the end of an async.execute region, add it to the
|
|
// worklist. 'operator()(executeOp)' would do the same, but this is faster.
|
|
auto executeOp = dyn_cast<async::ExecuteOp>(it->getParentOp());
|
|
if (executeOp && areAllUsersExecuteOrAwait(executeOp.token()) &&
|
|
!it->getNextNode())
|
|
worklist.push_back(waitOp);
|
|
}
|
|
|
|
SmallVector<gpu::WaitOp, 8> worklist;
|
|
};
|
|
|
|
// Replaces synchronous GPU ops in the op's region with asynchronous ones and
|
|
// inserts the necessary synchronization (as gpu.wait ops). Assumes sequential
|
|
// execution semantics and that no GPU ops are asynchronous yet.
|
|
void GpuAsyncRegionPass::runOnFunction() {
|
|
if (getFunction()
|
|
.getRegion()
|
|
.walk(ThreadTokenCallback(getContext()))
|
|
.wasInterrupted())
|
|
return signalPassFailure();
|
|
|
|
// Collect gpu.wait ops that we can move out of async.execute regions.
|
|
getFunction().getRegion().walk(DeferWaitCallback());
|
|
}
|
|
|
|
std::unique_ptr<OperationPass<FuncOp>> mlir::createGpuAsyncRegionPass() {
|
|
return std::make_unique<GpuAsyncRegionPass>();
|
|
}
|