[flang-rt] Optimise ShallowCopy and use it in CopyInAssign (#140569)

Using Descriptor.Element<>() when iterating through a rank-1 array is
currently inefficient: the generic implementation, which must handle
arrays of any rank, prevents the compiler from performing optimisations
that would make the rank-1 case considerably faster.
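
To illustrate the difference, here is a minimal sketch (SumGeneric and
SumRank1 are hypothetical helpers for illustration, not runtime code,
and the include path assumes the flang-rt layout): the generic path
recomputes a byte offset from the whole subscript vector on every
element access, while a rank-1 traversal only needs to bump a running
offset by the dimension's byte stride.

    #include "flang-rt/runtime/descriptor.h"
    #include <cstddef>
    #include <cstdint>

    using namespace Fortran::runtime;

    // Generic traversal: every access recomputes a byte offset from the
    // whole subscript vector, hiding the simple linear stride from the
    // compiler.
    static std::int32_t SumGeneric(const Descriptor &desc) {
      SubscriptValue at[maxRank];
      desc.GetLowerBounds(at);
      std::int32_t sum{0};
      for (std::size_t n{desc.Elements()}; n-- > 0;
           desc.IncrementSubscripts(at)) {
        sum += *desc.Element<std::int32_t>(at);
      }
      return sum;
    }

    // Rank-1 traversal: a single running byte offset bumped by the
    // stride, which the optimiser can strength-reduce and vectorise.
    static std::int32_t SumRank1(const Descriptor &desc) {
      std::size_t offset{0};
      std::size_t stride{
          static_cast<std::size_t>(desc.GetDimension(0).ByteStride())};
      std::int32_t sum{0};
      for (std::size_t n{desc.Elements()}; n-- > 0; offset += stride) {
        sum += *desc.OffsetElement<std::int32_t>(offset);
      }
      return sum;
    }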

This pattern currently occurs inside ShallowCopy, as well as in
CopyInAssign, where the elemental-copy path (inside Assign) is
equivalent to ShallowCopyDiscontiguousToDiscontiguous.

To address that, add a DescriptorIterator abstraction specialised for
arrays of various ranks, and use it throughout ShallowCopy to iterate
over the arrays.
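
Condensed, the new ShallowCopy loops follow the pattern below
(CopyElements is an illustrative name only; the real variants in
tools.cpp additionally specialise on the element size):

    #include "flang-rt/runtime/descriptor.h"
    #include <cstddef>
    #include <cstring>

    using namespace Fortran::runtime;

    // The iterator's rank is a template parameter, so for RANK == 1 the
    // Advance()/Get() calls collapse to a plain pointer bump.
    template <typename P, int RANK>
    static void CopyElements(const Descriptor &to, const Descriptor &from) {
      DescriptorIterator<RANK> toIt{to};
      DescriptorIterator<RANK> fromIt{from};
      std::size_t elementBytes{to.ElementBytes()};
      for (std::size_t n{to.Elements()}; n-- > 0;
           toIt.Advance(), fromIt.Advance()) {
        std::memcpy(
            toIt.template Get<P>(), fromIt.template Get<P>(), elementBytes);
      }
    }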

Furthermore, depending on the pointer type passed to memcpy, the
optimiser can remove the memcpy calls from ShallowCopy altogether, which
can yield substantial performance improvements on its own. Specialise
ShallowCopy for various element pointer types to make these
optimisations possible.
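
The effect relied upon is standard compiler behaviour: a memcpy whose
size is a compile-time constant and whose pointers have a concrete
element type is typically lowered to a plain load/store pair. A minimal
illustration (not runtime code):

    #include <cstdint>
    #include <cstring>

    // With the size known at compile time, compilers typically lower
    // this memcpy to a single 32-bit load and store (effectively
    // *to = *from) rather than an opaque byte-wise library call.
    inline void CopyOneInt32(std::int32_t *to, const std::int32_t *from) {
      std::memcpy(to, from, sizeof(std::int32_t));
    }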

Finally, replace the call to Assign inside CopyInAssign with a call to
the newly optimised ShallowCopy.

For the thornado-mini application, this reduces the runtime by 27.7%.

---------

Signed-off-by: Kajetan Puchalski <kajetan.puchalski@arm.com>
Author: Kajetan Puchalski
Date: 2025-05-22 15:11:46 +01:00 (committed by GitHub)
Parent: 6375a8508e
Commit: c2892b0bdf
6 changed files with 232 additions and 23 deletions

flang-rt/include/flang-rt/runtime/descriptor.h

@@ -437,6 +437,64 @@ private:
 };
 static_assert(sizeof(Descriptor) == sizeof(ISO::CFI_cdesc_t));
 
+// Lightweight iterator-like API to simplify specialising Descriptor indexing
+// in cases where it can improve application performance. On account of the
+// purpose of this API being performance optimisation, it is up to the user to
+// do all the necessary checks to make sure the specialised variants can be used
+// safely and that Advance() is not called more times than the number of
+// elements in the Descriptor allows for.
+// Default RANK=-1 supports array descriptors of any rank up to maxRank.
+template <int RANK = -1> class DescriptorIterator {
+private:
+  const Descriptor &descriptor;
+  SubscriptValue subscripts[maxRank];
+  std::size_t elementOffset{0};
+
+public:
+  RT_API_ATTRS DescriptorIterator(const Descriptor &descriptor)
+      : descriptor(descriptor) {
+    // We do not need the subscripts to iterate over a rank-1 array
+    if constexpr (RANK != 1) {
+      descriptor.GetLowerBounds(subscripts);
+    }
+  };
+
+  template <typename A> RT_API_ATTRS A *Get() {
+    std::size_t offset{0};
+    // The rank-1 case doesn't require looping at all
+    if constexpr (RANK == 1) {
+      offset = elementOffset;
+      // The compiler might be able to optimise this better if we know the rank
+      // at compile time
+    } else if constexpr (RANK != -1) {
+      for (int j{0}; j < RANK; ++j) {
+        offset += descriptor.SubscriptByteOffset(j, subscripts[j]);
+      }
+      // General fallback
+    } else {
+      offset = descriptor.SubscriptsToByteOffset(subscripts);
+    }
+
+    return descriptor.OffsetElement<A>(offset);
+  }
+
+  RT_API_ATTRS void Advance() {
+    if constexpr (RANK == 1) {
+      elementOffset += descriptor.GetDimension(0).ByteStride();
+    } else if constexpr (RANK != -1) {
+      for (int j{0}; j < RANK; ++j) {
+        const Dimension &dim{descriptor.GetDimension(j)};
+        if (subscripts[j]++ < dim.UpperBound()) {
+          break;
+        }
+        subscripts[j] = dim.LowerBound();
+      }
+    } else {
+      descriptor.IncrementSubscripts(subscripts);
+    }
+  }
+};
+
 // Properly configured instances of StaticDescriptor will occupy the
 // exact amount of storage required for the descriptor, its dimensional
 // information, and possible addendum. To build such a static descriptor,

flang-rt/include/flang-rt/runtime/tools.h

@@ -511,10 +511,13 @@ inline RT_API_ATTRS const char *FindCharacter(
 // Copy payload data from one allocated descriptor to another.
 // Assumes element counts and element sizes match, and that both
 // descriptors are allocated.
+template <typename P = char, int RANK = -1>
 RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from);
+template <typename P = char, int RANK = -1>
 RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
     const Descriptor &to, const Descriptor &from);
+template <typename P = char, int RANK = -1>
 RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from);
 RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,

flang-rt/lib/runtime/assign.cpp

@@ -497,7 +497,7 @@ RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from,
       }
     } else { // elemental copies, possibly with character truncation
       for (std::size_t n{toElements}; n-- > 0;
-           to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
+          to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
         memmoveFct(to.Element<char>(toAt), from.Element<const char>(fromAt),
             toElementBytes);
       }
@@ -591,7 +591,8 @@ void RTDEF(CopyInAssign)(Descriptor &temp, const Descriptor &var,
   temp = var;
   temp.set_base_addr(nullptr);
   temp.raw().attribute = CFI_attribute_allocatable;
-  RTNAME(AssignTemporary)(temp, var, sourceFile, sourceLine);
+  temp.Allocate(kNoAsyncId);
+  ShallowCopy(temp, var);
 }
 
 void RTDEF(CopyOutAssign)(
@@ -600,9 +601,10 @@ void RTDEF(CopyOutAssign)(
 
   // Copyout from the temporary must not cause any finalizations
   // for LHS. The variable must be properly initialized already.
-  if (var)
-    Assign(*var, temp, terminator, NoAssignFlags);
-  temp.Destroy(/*finalize=*/false, /*destroyPointers=*/false, &terminator);
+  if (var) {
+    ShallowCopy(*var, temp);
+  }
+  temp.Deallocate();
 }
 
 void RTDEF(AssignExplicitLengthCharacter)(Descriptor &to,

flang-rt/lib/runtime/tools.cpp

@@ -114,61 +114,151 @@ RT_API_ATTRS void CheckIntegerKind(
 }
 }
 
+template <typename P, int RANK>
 RT_API_ATTRS void ShallowCopyDiscontiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from) {
-  SubscriptValue toAt[maxRank], fromAt[maxRank];
-  to.GetLowerBounds(toAt);
-  from.GetLowerBounds(fromAt);
+  DescriptorIterator<RANK> toIt{to};
+  DescriptorIterator<RANK> fromIt{from};
+  // Knowing the size at compile time can enable memcpy inlining optimisations
+  constexpr std::size_t typeElementBytes{sizeof(P)};
+  // We might still need to check the actual size as a fallback
   std::size_t elementBytes{to.ElementBytes()};
   for (std::size_t n{to.Elements()}; n-- > 0;
-       to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
-    std::memcpy(
-        to.Element<char>(toAt), from.Element<char>(fromAt), elementBytes);
+       toIt.Advance(), fromIt.Advance()) {
+    // typeElementBytes == 1 when P is a char - the non-specialised case
+    if constexpr (typeElementBytes != 1) {
+      std::memcpy(
+          toIt.template Get<P>(), fromIt.template Get<P>(), typeElementBytes);
+    } else {
+      std::memcpy(
+          toIt.template Get<P>(), fromIt.template Get<P>(), elementBytes);
+    }
   }
 }
 
+template <typename P, int RANK>
 RT_API_ATTRS void ShallowCopyDiscontiguousToContiguous(
     const Descriptor &to, const Descriptor &from) {
   char *toAt{to.OffsetElement()};
-  SubscriptValue fromAt[maxRank];
-  from.GetLowerBounds(fromAt);
+  constexpr std::size_t typeElementBytes{sizeof(P)};
   std::size_t elementBytes{to.ElementBytes()};
+  DescriptorIterator<RANK> fromIt{from};
   for (std::size_t n{to.Elements()}; n-- > 0;
-       toAt += elementBytes, from.IncrementSubscripts(fromAt)) {
-    std::memcpy(toAt, from.Element<char>(fromAt), elementBytes);
+       toAt += elementBytes, fromIt.Advance()) {
+    if constexpr (typeElementBytes != 1) {
+      std::memcpy(toAt, fromIt.template Get<P>(), typeElementBytes);
+    } else {
+      std::memcpy(toAt, fromIt.template Get<P>(), elementBytes);
+    }
   }
 }
 
+template <typename P, int RANK>
 RT_API_ATTRS void ShallowCopyContiguousToDiscontiguous(
     const Descriptor &to, const Descriptor &from) {
-  SubscriptValue toAt[maxRank];
-  to.GetLowerBounds(toAt);
   char *fromAt{from.OffsetElement()};
+  DescriptorIterator<RANK> toIt{to};
+  constexpr std::size_t typeElementBytes{sizeof(P)};
   std::size_t elementBytes{to.ElementBytes()};
   for (std::size_t n{to.Elements()}; n-- > 0;
-       to.IncrementSubscripts(toAt), fromAt += elementBytes) {
-    std::memcpy(to.Element<char>(toAt), fromAt, elementBytes);
+       toIt.Advance(), fromAt += elementBytes) {
+    if constexpr (typeElementBytes != 1) {
+      std::memcpy(toIt.template Get<P>(), fromAt, typeElementBytes);
+    } else {
+      std::memcpy(toIt.template Get<P>(), fromAt, elementBytes);
+    }
   }
 }
 
-RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
+// ShallowCopy helper for calling the correct specialised variant based on
+// scenario
+template <typename P, int RANK = -1>
+RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from,
     bool toIsContiguous, bool fromIsContiguous) {
   if (toIsContiguous) {
     if (fromIsContiguous) {
       std::memcpy(to.OffsetElement(), from.OffsetElement(),
           to.Elements() * to.ElementBytes());
     } else {
-      ShallowCopyDiscontiguousToContiguous(to, from);
+      ShallowCopyDiscontiguousToContiguous<P, RANK>(to, from);
     }
   } else {
     if (fromIsContiguous) {
-      ShallowCopyContiguousToDiscontiguous(to, from);
+      ShallowCopyContiguousToDiscontiguous<P, RANK>(to, from);
     } else {
-      ShallowCopyDiscontiguousToDiscontiguous(to, from);
+      ShallowCopyDiscontiguousToDiscontiguous<P, RANK>(to, from);
     }
   }
 }
 
+// Most arrays are much closer to rank-1 than to maxRank.
+// Doing the recursion upwards instead of downwards puts the more common
+// cases earlier in the if-chain and has a tangible impact on performance.
+template <typename P, int RANK> struct ShallowCopyRankSpecialize {
+  static bool execute(const Descriptor &to, const Descriptor &from,
+      bool toIsContiguous, bool fromIsContiguous) {
+    if (to.rank() == RANK && from.rank() == RANK) {
+      ShallowCopyInner<P, RANK>(to, from, toIsContiguous, fromIsContiguous);
+      return true;
+    }
+    return ShallowCopyRankSpecialize<P, RANK + 1>::execute(
+        to, from, toIsContiguous, fromIsContiguous);
+  }
+};
+
+template <typename P> struct ShallowCopyRankSpecialize<P, maxRank + 1> {
+  static bool execute(const Descriptor &to, const Descriptor &from,
+      bool toIsContiguous, bool fromIsContiguous) {
+    return false;
+  }
+};
+
+// ShallowCopy helper for specialising the variants based on array rank
+template <typename P>
+RT_API_ATTRS void ShallowCopyRank(const Descriptor &to, const Descriptor &from,
+    bool toIsContiguous, bool fromIsContiguous) {
+  // Try to call a specialised ShallowCopy variant from rank-1 up to maxRank
+  bool specialized{ShallowCopyRankSpecialize<P, 1>::execute(
+      to, from, toIsContiguous, fromIsContiguous)};
+  if (!specialized) {
+    ShallowCopyInner<P>(to, from, toIsContiguous, fromIsContiguous);
+  }
+}
+
+RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from,
+    bool toIsContiguous, bool fromIsContiguous) {
+  std::size_t elementBytes{to.ElementBytes()};
+  // Checking the type at runtime and making sure the pointer passed to memcpy
+  // has a type that matches the element type makes it possible for the compiler
+  // to optimise out the memcpy calls altogether and can substantially improve
+  // performance for some applications.
+  if (to.type().IsInteger()) {
+    if (elementBytes == sizeof(int64_t)) {
+      ShallowCopyRank<int64_t>(to, from, toIsContiguous, fromIsContiguous);
+    } else if (elementBytes == sizeof(int32_t)) {
+      ShallowCopyRank<int32_t>(to, from, toIsContiguous, fromIsContiguous);
+    } else if (elementBytes == sizeof(int16_t)) {
+      ShallowCopyRank<int16_t>(to, from, toIsContiguous, fromIsContiguous);
+#if defined USING_NATIVE_INT128_T
+    } else if (elementBytes == sizeof(__int128_t)) {
+      ShallowCopyRank<__int128_t>(to, from, toIsContiguous, fromIsContiguous);
+#endif
+    } else {
+      ShallowCopyRank<char>(to, from, toIsContiguous, fromIsContiguous);
+    }
+  } else if (to.type().IsReal()) {
+    if (elementBytes == sizeof(double)) {
+      ShallowCopyRank<double>(to, from, toIsContiguous, fromIsContiguous);
+    } else if (elementBytes == sizeof(float)) {
+      ShallowCopyRank<float>(to, from, toIsContiguous, fromIsContiguous);
+    } else {
+      ShallowCopyRank<char>(to, from, toIsContiguous, fromIsContiguous);
+    }
+  } else {
+    ShallowCopyRank<char>(to, from, toIsContiguous, fromIsContiguous);
+  }
+}
+
 RT_API_ATTRS void ShallowCopy(const Descriptor &to, const Descriptor &from) {
   ShallowCopy(to, from, to.IsContiguous(), from.IsContiguous());
 }

flang-rt/unittests/Runtime/Assign.cpp

@@ -0,0 +1,55 @@
+//===-- unittests/Runtime/Assign.cpp ------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Runtime/assign.h"
+#include "tools.h"
+#include "gtest/gtest.h"
+#include <vector>
+
+using namespace Fortran::runtime;
+using Fortran::common::TypeCategory;
+
+TEST(Assign, RTNAME(CopyInAssign)) {
+  // contiguous -> contiguous copy in
+  auto intArray{MakeArray<TypeCategory::Integer, 1>(
+      std::vector<int>{2, 3}, std::vector<int>{1, 2, 3, 4, 5, 6}, sizeof(int))};
+  StaticDescriptor<2> staticIntResult;
+  Descriptor &intResult{staticIntResult.descriptor()};
+
+  RTNAME(CopyInAssign(intResult, *intArray));
+  ASSERT_TRUE(intResult.IsAllocated());
+  ASSERT_TRUE(intResult.IsContiguous());
+  ASSERT_EQ(intResult.type(), intArray->type());
+  ASSERT_EQ(intResult.ElementBytes(), sizeof(int));
+  EXPECT_EQ(intResult.GetDimension(0).LowerBound(), 1);
+  EXPECT_EQ(intResult.GetDimension(0).Extent(), 2);
+  EXPECT_EQ(intResult.GetDimension(1).LowerBound(), 1);
+  EXPECT_EQ(intResult.GetDimension(1).Extent(), 3);
+  int expected[6] = {1, 2, 3, 4, 5, 6};
+  EXPECT_EQ(
+      std::memcmp(intResult.OffsetElement<int>(0), expected, 6 * sizeof(int)),
+      0);
+  intResult.Destroy();
+
+  // discontiguous -> contiguous rank-1 copy in
+  intArray = MakeArray<TypeCategory::Integer, 1>(std::vector<int>{8},
+      std::vector<int>{1, 2, 3, 4, 5, 6, 7, 8}, sizeof(int));
+  StaticDescriptor<1> staticIntResultStrided;
+  Descriptor &intResultStrided{staticIntResultStrided.descriptor()};
+
+  // Treat the descriptor as a strided array of 4
+  intArray->GetDimension(0).SetByteStride(sizeof(int) * 2);
+  intArray->GetDimension(0).SetExtent(4);
+  RTNAME(CopyInAssign(intResultStrided, *intArray));
+  int expectedStrided[4] = {1, 3, 5, 7};
+  EXPECT_EQ(std::memcmp(intResultStrided.OffsetElement<int>(0), expectedStrided,
+                4 * sizeof(int)),
+      0);
+  intResultStrided.Destroy();
+}

flang-rt/unittests/Runtime/CMakeLists.txt

@@ -10,6 +10,7 @@ add_flangrt_unittest(RuntimeTests
   AccessTest.cpp
   Allocatable.cpp
   ArrayConstructor.cpp
+  Assign.cpp
   BufferTest.cpp
   CharacterTest.cpp
   CommandTest.cpp