[flang][fir] always use memcpy for fir.box (#113949)
@jeanPerier explained the importance of converting box loads and stores into `memcpy`s instead of aggregate loads and stores, and I'll do my best to explain it here.

* [(godbolt link) Example comparing opt transformations on memcpys vs aggregate load/stores](https://godbolt.org/z/be7xM83cG)
* LLVM can more effectively reason about memcpys compared to aggregate load/stores.
* This came up when others were discussing array descriptors for assumed-rank arrays passed to `bind(c)` subroutines, with the implication that the array descriptors are known to have lower bounds of 1 and that they are not pointer/allocatable types.
* [(godbolt link) Clang also uses memcpys so we should probably follow them, assuming the clang developers are generating what they know Opt will handle more effectively.](https://godbolt.org/z/YT4x7387W)
* This currently may not help much without the `nocapture` attribute being propagated to function calls, but [it looks like someone may do this soon (discourse link)](https://discourse.llvm.org/t/applying-the-nocapture-attribute-to-reference-passed-arguments-in-fortran-subroutines/81401/23) or I can do this in a follow-up patch.

Note on test `flang/test/Fir/embox-char.fir`: it looks like the original test was auto-generated. I wasn't too sure which parts were especially important to test, so I regenerated the test. If we want the updated version to look more like the old version, I'll make those changes.
This commit is contained in:
@@ -2949,9 +2949,10 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
|
||||
llvm::LogicalResult
|
||||
matchAndRewrite(fir::LoadOp load, OpAdaptor adaptor,
|
||||
mlir::ConversionPatternRewriter &rewriter) const override {
|
||||
|
||||
mlir::Type llvmLoadTy = convertObjectType(load.getType());
|
||||
if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(load.getType())) {
|
||||
// fir.box is a special case because it is considered as an ssa values in
|
||||
// fir.box is a special case because it is considered an ssa value in
|
||||
// fir, but it is lowered as a pointer to a descriptor. So
|
||||
// fir.ref<fir.box> and fir.box end up being the same llvm types and
|
||||
// loading a fir.ref<fir.box> is implemented as taking a snapshot of the
|
||||
@@ -2960,30 +2961,17 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
|
||||
mlir::Location loc = load.getLoc();
|
||||
auto newBoxStorage =
|
||||
genAllocaAndAddrCastWithType(loc, llvmLoadTy, defaultAlign, rewriter);
|
||||
// TODO: always generate llvm.memcpy, LLVM is better at optimizing it than
|
||||
// aggregate loads + stores.
|
||||
if (boxTy.isAssumedRank()) {
|
||||
|
||||
TypePair boxTypePair{boxTy, llvmLoadTy};
|
||||
mlir::Value boxSize =
|
||||
computeBoxSize(loc, boxTypePair, inputBoxStorage, rewriter);
|
||||
auto memcpy = rewriter.create<mlir::LLVM::MemcpyOp>(
|
||||
loc, newBoxStorage, inputBoxStorage, boxSize, /*isVolatile=*/false);
|
||||
if (std::optional<mlir::ArrayAttr> optionalTag = load.getTbaa())
|
||||
memcpy.setTBAATags(*optionalTag);
|
||||
else
|
||||
attachTBAATag(memcpy, boxTy, boxTy, nullptr);
|
||||
} else {
|
||||
auto boxValue = rewriter.create<mlir::LLVM::LoadOp>(loc, llvmLoadTy,
|
||||
inputBoxStorage);
|
||||
if (std::optional<mlir::ArrayAttr> optionalTag = load.getTbaa())
|
||||
boxValue.setTBAATags(*optionalTag);
|
||||
else
|
||||
attachTBAATag(boxValue, boxTy, boxTy, nullptr);
|
||||
auto storeOp =
|
||||
rewriter.create<mlir::LLVM::StoreOp>(loc, boxValue, newBoxStorage);
|
||||
attachTBAATag(storeOp, boxTy, boxTy, nullptr);
|
||||
}
|
||||
TypePair boxTypePair{boxTy, llvmLoadTy};
|
||||
mlir::Value boxSize =
|
||||
computeBoxSize(loc, boxTypePair, inputBoxStorage, rewriter);
|
||||
auto memcpy = rewriter.create<mlir::LLVM::MemcpyOp>(
|
||||
loc, newBoxStorage, inputBoxStorage, boxSize, /*isVolatile=*/false);
|
||||
|
||||
if (std::optional<mlir::ArrayAttr> optionalTag = load.getTbaa())
|
||||
memcpy.setTBAATags(*optionalTag);
|
||||
else
|
||||
attachTBAATag(memcpy, boxTy, boxTy, nullptr);
|
||||
rewriter.replaceOp(load, newBoxStorage);
|
||||
} else {
|
||||
auto loadOp = rewriter.create<mlir::LLVM::LoadOp>(
|
||||
@@ -3227,20 +3215,13 @@ struct StoreOpConversion : public fir::FIROpConversion<fir::StoreOp> {
|
||||
mlir::LLVM::AliasAnalysisOpInterface newOp;
|
||||
if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(storeTy)) {
|
||||
mlir::Type llvmBoxTy = lowerTy().convertBoxTypeAsStruct(boxTy);
|
||||
// fir.box value is actually in memory, load it first before storing it,
|
||||
// or do a memcopy for assumed-rank descriptors.
|
||||
if (boxTy.isAssumedRank()) {
|
||||
TypePair boxTypePair{boxTy, llvmBoxTy};
|
||||
mlir::Value boxSize =
|
||||
computeBoxSize(loc, boxTypePair, llvmValue, rewriter);
|
||||
newOp = rewriter.create<mlir::LLVM::MemcpyOp>(
|
||||
loc, llvmMemref, llvmValue, boxSize, /*isVolatile=*/false);
|
||||
} else {
|
||||
auto val =
|
||||
rewriter.create<mlir::LLVM::LoadOp>(loc, llvmBoxTy, llvmValue);
|
||||
attachTBAATag(val, boxTy, boxTy, nullptr);
|
||||
newOp = rewriter.create<mlir::LLVM::StoreOp>(loc, val, llvmMemref);
|
||||
}
|
||||
// Always use memcpy because LLVM is not as effective at optimizing
|
||||
// aggregate loads/stores as it is optimizing memcpy.
|
||||
TypePair boxTypePair{boxTy, llvmBoxTy};
|
||||
mlir::Value boxSize =
|
||||
computeBoxSize(loc, boxTypePair, llvmValue, rewriter);
|
||||
newOp = rewriter.create<mlir::LLVM::MemcpyOp>(
|
||||
loc, llvmMemref, llvmValue, boxSize, /*isVolatile=*/false);
|
||||
} else {
|
||||
newOp = rewriter.create<mlir::LLVM::StoreOp>(loc, llvmValue, llvmMemref);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user