[flang][fir] always use memcpy for fir.box (#113949)

@jeanPerier explained the importance of converting box loads and stores into `memcpy`s instead of aggregate loads and stores, and I'll do my best to explain it here. * [(godbolt link) Example comparing opt transformations on memcpys vs aggregate load/stores](https://godbolt.org/z/be7xM83cG) * LLVM can more effectively reason about memcpys compared to aggregate load/stores. * This came up when others were discussing array descriptors for assumed-rank arrays passed to `bind(c)` subroutines, with the implication that the array descriptors are known to have lower bounds of 1 and that they are not pointer/allocatable types. * [(godbolt link) Clang also uses memcpys so we should probably follow them, assuming the clang developers are generatign what they know Opt will handle more effectively.](https://godbolt.org/z/YT4x7387W) * This currently may not help much without the `nocapture` attribute being propagated to function calls, but [it looks like someone may do this soon (discourse link)](https://discourse.llvm.org/t/applying-the-nocapture-attribute-to-reference-passed-arguments-in-fortran-subroutines/81401/23) or I can do this in a follow-up patch. Note on test `flang/test/Fir/embox-char.fir`: it looks like the original test was auto-generated. I wasn't too sure which parts were especially important to test, so I regenerated the test. If we want the updated version to look more like the old version, I'll make those changes.
2024-10-30 09:50:27 -07:00
parent 49277253f0
commit 0c9a02355a
11 changed files with 187 additions and 204 deletions
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -2949,9 +2949,10 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
  llvm::LogicalResult
  matchAndRewrite(fir::LoadOp load, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
+
    mlir::Type llvmLoadTy = convertObjectType(load.getType());
    if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(load.getType())) {
-      // fir.box is a special case because it is considered as an ssa values in
+      // fir.box is a special case because it is considered an ssa value in
      // fir, but it is lowered as a pointer to a descriptor. So
      // fir.ref<fir.box> and fir.box end up being the same llvm types and
      // loading a fir.ref<fir.box> is implemented as taking a snapshot of the
@@ -2960,30 +2961,17 @@ struct LoadOpConversion : public fir::FIROpConversion<fir::LoadOp> {
      mlir::Location loc = load.getLoc();
      auto newBoxStorage =
          genAllocaAndAddrCastWithType(loc, llvmLoadTy, defaultAlign, rewriter);
-      // TODO: always generate llvm.memcpy, LLVM is better at optimizing it than
-      // aggregate loads + stores.
-      if (boxTy.isAssumedRank()) {

-        TypePair boxTypePair{boxTy, llvmLoadTy};
-        mlir::Value boxSize =
-            computeBoxSize(loc, boxTypePair, inputBoxStorage, rewriter);
-        auto memcpy = rewriter.create<mlir::LLVM::MemcpyOp>(
-            loc, newBoxStorage, inputBoxStorage, boxSize, /*isVolatile=*/false);
-        if (std::optional<mlir::ArrayAttr> optionalTag = load.getTbaa())
-          memcpy.setTBAATags(*optionalTag);
-        else
-          attachTBAATag(memcpy, boxTy, boxTy, nullptr);
-      } else {
-        auto boxValue = rewriter.create<mlir::LLVM::LoadOp>(loc, llvmLoadTy,
-                                                            inputBoxStorage);
-        if (std::optional<mlir::ArrayAttr> optionalTag = load.getTbaa())
-          boxValue.setTBAATags(*optionalTag);
-        else
-          attachTBAATag(boxValue, boxTy, boxTy, nullptr);
-        auto storeOp =
-            rewriter.create<mlir::LLVM::StoreOp>(loc, boxValue, newBoxStorage);
-        attachTBAATag(storeOp, boxTy, boxTy, nullptr);
-      }
+      TypePair boxTypePair{boxTy, llvmLoadTy};
+      mlir::Value boxSize =
+          computeBoxSize(loc, boxTypePair, inputBoxStorage, rewriter);
+      auto memcpy = rewriter.create<mlir::LLVM::MemcpyOp>(
+          loc, newBoxStorage, inputBoxStorage, boxSize, /*isVolatile=*/false);
+
+      if (std::optional<mlir::ArrayAttr> optionalTag = load.getTbaa())
+        memcpy.setTBAATags(*optionalTag);
+      else
+        attachTBAATag(memcpy, boxTy, boxTy, nullptr);
      rewriter.replaceOp(load, newBoxStorage);
    } else {
      auto loadOp = rewriter.create<mlir::LLVM::LoadOp>(
@@ -3227,20 +3215,13 @@ struct StoreOpConversion : public fir::FIROpConversion<fir::StoreOp> {
    mlir::LLVM::AliasAnalysisOpInterface newOp;
    if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(storeTy)) {
      mlir::Type llvmBoxTy = lowerTy().convertBoxTypeAsStruct(boxTy);
-      // fir.box value is actually in memory, load it first before storing it,
-      // or do a memcopy for assumed-rank descriptors.
-      if (boxTy.isAssumedRank()) {
-        TypePair boxTypePair{boxTy, llvmBoxTy};
-        mlir::Value boxSize =
-            computeBoxSize(loc, boxTypePair, llvmValue, rewriter);
-        newOp = rewriter.create<mlir::LLVM::MemcpyOp>(
-            loc, llvmMemref, llvmValue, boxSize, /*isVolatile=*/false);
-      } else {
-        auto val =
-            rewriter.create<mlir::LLVM::LoadOp>(loc, llvmBoxTy, llvmValue);
-        attachTBAATag(val, boxTy, boxTy, nullptr);
-        newOp = rewriter.create<mlir::LLVM::StoreOp>(loc, val, llvmMemref);
-      }
+      // Always use memcpy because LLVM is not as effective at optimizing
+      // aggregate loads/stores as it is optimizing memcpy.
+      TypePair boxTypePair{boxTy, llvmBoxTy};
+      mlir::Value boxSize =
+          computeBoxSize(loc, boxTypePair, llvmValue, rewriter);
+      newOp = rewriter.create<mlir::LLVM::MemcpyOp>(
+          loc, llvmMemref, llvmValue, boxSize, /*isVolatile=*/false);
    } else {
      newOp = rewriter.create<mlir::LLVM::StoreOp>(loc, llvmValue, llvmMemref);
    }