Files
clang-p2996/flang/runtime/reduction-templates.h
Peter Klausler 3ada883f7c [flang][runtime] Runtime support for REDUCE() (#86214)
Supports the REDUCE() transformational intrinsic function of Fortran
(see F'2023 16.9.173) in a manner similar to the existing support for
SUM(), PRODUCT(), &c. There are APIs for total reductions to scalar
results, and APIs for partial reductions that reduce the rank of the
argument by one.

This implementation requires more functions than other reductions
because the various possible types of the user-supplied OPERATION=
function need to be elaborated.

Once the basic API in reduce.h has been approved, later patches will
implement lowering.

REDUCE() is primarily for completeness, not portability; only one other
Fortran compiler implements this F'2018 feature today, and only some
types work correctly with it.
2024-03-26 09:21:16 -07:00

417 lines
16 KiB
C++

//===-- runtime/reduction-templates.h ---------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Generic function templates used by various reduction transformational
// intrinsic functions (SUM, PRODUCT, &c.)
//
// * Partial reductions (i.e., those with DIM= arguments that are not
// required to be 1 by the rank of the argument) return arrays that
// are dynamically allocated in a caller-supplied descriptor.
// * Total reductions (i.e., no DIM= argument) with FINDLOC, MAXLOC, & MINLOC
// return integer vectors of some kind, not scalars; a caller-supplied
// descriptor is used.
// * Character-valued reductions (MAXVAL & MINVAL) return arbitrary
// length results, dynamically allocated in a caller-supplied descriptor
#ifndef FORTRAN_RUNTIME_REDUCTION_TEMPLATES_H_
#define FORTRAN_RUNTIME_REDUCTION_TEMPLATES_H_
#include "numeric-templates.h"
#include "terminator.h"
#include "tools.h"
#include "flang/Runtime/cpp-type.h"
#include "flang/Runtime/descriptor.h"
#include <algorithm>
namespace Fortran::runtime {
// Reductions are implemented with *accumulators*, which are instances of
// classes that incrementally build up the result (or an element thereof) during
// a traversal of the unmasked elements of an array. Each accumulator class
// supports a constructor (which captures a reference to the array), an
// AccumulateAt() member function that applies supplied subscripts to the
// array and does something with a scalar element, and a GetResult()
// member function that copies a final result into its destination.
// Accumulators used for partial reductions must also supply a Reinitialize()
// member function, which is called before each result element is computed.
// Total reduction: feeds the selected elements of the array argument to a
// single accumulator, producing a scalar (or a vector in the cases of
// FINDLOC, MAXLOC, & MINLOC).  These are the cases without DIM=, or cases
// where the argument has rank 1 and DIM=, if present, must be 1.
//  - Crashes via the terminator when dim is neither 0 nor 1.
//  - With an array MASK=, only elements whose mask element is true are
//    passed to AccumulateAt(); ARRAY and MASK are checked for conformability.
//  - With a scalar MASK= that is false, no element is visited and the
//    accumulator is left exactly as the caller supplied it.
//  - The traversal ends early as soon as AccumulateAt() returns false.
// The caller retrieves the result from the accumulator afterwards.
template <typename TYPE, typename ACCUMULATOR>
inline RT_API_ATTRS void DoTotalReduction(const Descriptor &x, int dim,
    const Descriptor *mask, ACCUMULATOR &accumulator, const char *intrinsic,
    Terminator &terminator) {
  if (dim != 0 && dim != 1) {
    terminator.Crash("%s: bad DIM=%d for ARRAY argument with rank %d",
        intrinsic, dim, x.rank());
  }
  SubscriptValue xAt[maxRank];
  x.GetLowerBounds(xAt);
  if (mask) {
    CheckConformability(x, *mask, terminator, intrinsic, "ARRAY", "MASK");
    if (mask->rank() > 0) {
      // Array MASK=: walk ARRAY and MASK in lock step.
      SubscriptValue maskAt[maxRank];
      mask->GetLowerBounds(maskAt);
      auto remaining{x.Elements()};
      while (remaining--) {
        if (IsLogicalElementTrue(*mask, maskAt) &&
            !accumulator.template AccumulateAt<TYPE>(xAt)) {
          return; // cut short, result is known
        }
        x.IncrementSubscripts(xAt);
        mask->IncrementSubscripts(maskAt);
      }
      return;
    }
    if (!IsLogicalScalarTrue(*mask)) {
      // Scalar MASK=.FALSE.: nothing is accumulated.
      return;
    }
  }
  // No MASK=, or scalar MASK=.TRUE.: visit every element.
  auto remaining{x.Elements()};
  while (remaining--) {
    if (!accumulator.template AccumulateAt<TYPE>(xAt)) {
      return; // cut short, result is known
    }
    x.IncrementSubscripts(xAt);
  }
}
// Performs a total reduction of x and returns its result by value.
//  - source/line identify the call site for any runtime error message.
//  - A runtime check verifies that the type code of x is TypeCode(CAT, KIND).
//  - dim, mask, and intrinsic are forwarded to DoTotalReduction().
// When CppTypeFor<CAT, KIND> is void, nothing is returned and the
// accumulator's zero-argument GetResult() is invoked instead (the result is
// delivered by the accumulator itself, as in REDUCE() for derived type).
// Otherwise the accumulator stores the result through GetResult(&result).
//
// GetResult is called without the "template" disambiguator on every
// compiler: a name prefixed by "template" must be followed by a template
// argument list, so "accumulator.template GetResult(...)" is not conforming
// C++ and is rejected by some compilers.  The plain member call is accepted
// everywhere and lets the argument types be deduced as before.
template <TypeCategory CAT, int KIND, typename ACCUMULATOR>
inline RT_API_ATTRS CppTypeFor<CAT, KIND> GetTotalReduction(const Descriptor &x,
    const char *source, int line, int dim, const Descriptor *mask,
    ACCUMULATOR &&accumulator, const char *intrinsic) {
  Terminator terminator{source, line};
  RUNTIME_CHECK(terminator, TypeCode(CAT, KIND) == x.type());
  using CppType = CppTypeFor<CAT, KIND>;
  DoTotalReduction<CppType>(x, dim, mask, accumulator, intrinsic, terminator);
  if constexpr (std::is_void_v<CppType>) {
    // Result is returned from accumulator, as in REDUCE() for derived type
    accumulator.GetResult();
  } else {
    CppType result;
    accumulator.GetResult(&result);
    return result;
  }
}
// Reductions on a dimension drop that dimension from the result; e.g.
// for SUM(array,DIM=2) where the shape of the array is [2,3,5], the shape
// of the result is [2,5] and result(j,k) = SUM(array(j,:,k)), possibly
// modified if the array has lower bounds other than one.
// Given result subscripts from[] = [j,k], this utility fills at[] with
// subscripts [j,_,k] into the descriptor so that the elements of
// array(j,:,k) can be reduced:
//  - every entry of at[] starts out as the descriptor's lower bound;
//  - entries before zeroBasedDim are offset by the matching entry of from[],
//    which is treated as 1-based;
//  - the entry at zeroBasedDim is left at the lower bound for the caller
//    to vary;
//  - entries after zeroBasedDim are offset by the entry of from[] one
//    position to the left.
inline RT_API_ATTRS void GetExpandedSubscripts(SubscriptValue at[],
    const Descriptor &descriptor, int zeroBasedDim,
    const SubscriptValue from[]) {
  descriptor.GetLowerBounds(at);
  const int rank{descriptor.rank()};
  // Leading dimensions map one-to-one onto the result subscripts.
  int leading{0};
  while (leading < zeroBasedDim) {
    at[leading] += from[leading] - 1 /*lower bound*/;
    ++leading;
  }
  // Trailing dimensions: skip the reduced one and read from[] shifted by one.
  for (int src{leading}, dst{leading + 1}; dst < rank; ++src, ++dst) {
    at[dst] += from[src] - 1;
  }
}
// Reduces one vector of x along dimension zeroBasedDim into *result.
//  - subscripts[] are the subscripts of the result element being computed;
//    they are expanded into subscripts of x by GetExpandedSubscripts().
//  - Each element along the reduced dimension, from its lower bound through
//    its extent, is passed to accumulator.AccumulateAt<TYPE>(); the loop
//    stops early if that returns false.
//  - The outcome is then stored via accumulator.GetResult(result,
//    zeroBasedDim).
// The accumulator is not reset here; that is the caller's responsibility.
//
// GetResult is called without the "template" disambiguator on every
// compiler: a name prefixed by "template" must be followed by a template
// argument list, so "accumulator.template GetResult(...)" is not conforming
// C++ and is rejected by some compilers.
template <typename TYPE, typename ACCUMULATOR>
inline RT_API_ATTRS void ReduceDimToScalar(const Descriptor &x,
    int zeroBasedDim, SubscriptValue subscripts[], TYPE *result,
    ACCUMULATOR &accumulator) {
  SubscriptValue xAt[maxRank];
  GetExpandedSubscripts(xAt, x, zeroBasedDim, subscripts);
  const auto &dim{x.GetDimension(zeroBasedDim)};
  SubscriptValue at{dim.LowerBound()};
  for (auto n{dim.Extent()}; n-- > 0; ++at) {
    xAt[zeroBasedDim] = at; // vary only the reduced dimension
    if (!accumulator.template AccumulateAt<TYPE>(xAt)) {
      break; // cut short, result is known
    }
  }
  accumulator.GetResult(result, zeroBasedDim);
}
// Masked variant of ReduceDimToScalar(): reduces one vector of x along
// dimension zeroBasedDim into *result, visiting only the elements whose
// corresponding element of the array mask is true.
//  - subscripts[] are the subscripts of the result element being computed;
//    they are expanded separately for x and for mask, each relative to its
//    own lower bounds.
//  - x and mask are stepped together along the reduced dimension, each
//    starting from its own lower bound, for the extent of x.
//  - The loop stops early if accumulator.AccumulateAt<TYPE>() returns false.
//  - The outcome is then stored via accumulator.GetResult(result,
//    zeroBasedDim).
// The accumulator is not reset here; that is the caller's responsibility.
//
// GetResult is called without the "template" disambiguator on every
// compiler: a name prefixed by "template" must be followed by a template
// argument list, so "accumulator.template GetResult(...)" is not conforming
// C++ and is rejected by some compilers.
template <typename TYPE, typename ACCUMULATOR>
inline RT_API_ATTRS void ReduceDimMaskToScalar(const Descriptor &x,
    int zeroBasedDim, SubscriptValue subscripts[], const Descriptor &mask,
    TYPE *result, ACCUMULATOR &accumulator) {
  SubscriptValue xAt[maxRank];
  SubscriptValue maskAt[maxRank];
  GetExpandedSubscripts(xAt, x, zeroBasedDim, subscripts);
  GetExpandedSubscripts(maskAt, mask, zeroBasedDim, subscripts);
  const auto &xDim{x.GetDimension(zeroBasedDim)};
  SubscriptValue xPos{xDim.LowerBound()};
  const auto &maskDim{mask.GetDimension(zeroBasedDim)};
  SubscriptValue maskPos{maskDim.LowerBound()};
  // The extent comes from x; xDim already refers to that dimension.
  for (auto n{xDim.Extent()}; n-- > 0; ++xPos, ++maskPos) {
    maskAt[zeroBasedDim] = maskPos;
    if (IsLogicalElementTrue(mask, maskAt)) {
      xAt[zeroBasedDim] = xPos;
      if (!accumulator.template AccumulateAt<TYPE>(xAt)) {
        break; // cut short, result is known
      }
    }
  }
  accumulator.GetResult(result, zeroBasedDim);
}
// Partial reductions with DIM=
// The result descriptor is set up by CreatePartialReductionResult() and then
// each of its elements is computed by reducing the corresponding vector of x
// along dimension dim (1-based).  The accumulator is reinitialized before
// each result element.
//  - Array MASK=: each vector is reduced under the matching mask vector.
//  - Scalar MASK=.FALSE.: every result element receives what a freshly
//    reinitialized accumulator reports from GetResult().
//  - No MASK=, or scalar MASK=.TRUE.: each vector is reduced in full.
template <typename ACCUMULATOR, TypeCategory CAT, int KIND>
inline RT_API_ATTRS void PartialReduction(Descriptor &result,
    const Descriptor &x, std::size_t resultElementSize, int dim,
    const Descriptor *mask, Terminator &terminator, const char *intrinsic,
    ACCUMULATOR &accumulator) {
  using CppType = CppTypeFor<CAT, KIND>;
  CreatePartialReductionResult(result, x, resultElementSize, dim, terminator,
      intrinsic, TypeCode{CAT, KIND});
  SubscriptValue at[maxRank];
  result.GetLowerBounds(at);
  INTERNAL_CHECK(result.rank() == 0 || at[0] == 1);
  const int zeroBasedDim{dim - 1};
  if (mask) {
    CheckConformability(x, *mask, terminator, intrinsic, "ARRAY", "MASK");
    if (mask->rank() > 0) {
      // Array MASK=
      auto remaining{result.Elements()};
      while (remaining-- > 0) {
        accumulator.Reinitialize();
        ReduceDimMaskToScalar<CppType, ACCUMULATOR>(x, zeroBasedDim, at, *mask,
            result.Element<CppType>(at), accumulator);
        result.IncrementSubscripts(at);
      }
      return;
    }
    if (!IsLogicalScalarTrue(*mask)) {
      // scalar MASK=.FALSE.
      accumulator.Reinitialize();
      auto remaining{result.Elements()};
      while (remaining-- > 0) {
        accumulator.GetResult(result.Element<CppType>(at));
        result.IncrementSubscripts(at);
      }
      return;
    }
  }
  // No MASK= or scalar MASK=.TRUE.
  auto remaining{result.Elements()};
  while (remaining-- > 0) {
    accumulator.Reinitialize();
    ReduceDimToScalar<CppType, ACCUMULATOR>(
        x, zeroBasedDim, at, result.Element<CppType>(at), accumulator);
    result.IncrementSubscripts(at);
  }
}
// Kind-dispatch helper for partial reductions of INTEGER arrays.
// Functor<KIND> instantiates the accumulator template ACCUM on an integer
// type of kind max(KIND, 4) and runs PartialReduction() with a result of
// INTEGER(KIND).
template <template <typename> class ACCUM>
struct PartialIntegerReductionHelper {
  template <int KIND> struct Functor {
    // use at least "int" for intermediate results
    static constexpr int Intermediate{std::max(KIND, 4)};
    RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
        int dim, const Descriptor *mask, Terminator &terminator,
        const char *intrinsic) const {
      using IntermediateType = CppTypeFor<TypeCategory::Integer, Intermediate>;
      using Accumulator = ACCUM<IntermediateType>;
      Accumulator accumulator{x};
      // The destination descriptor's elements have the same size as
      // the source's.
      const auto elementBytes{x.ElementBytes()};
      PartialReduction<Accumulator, TypeCategory::Integer, KIND>(result, x,
          elementBytes, dim, mask, terminator, intrinsic, accumulator);
    }
  };
};
// Partial reduction of an INTEGER array: selects the
// PartialIntegerReductionHelper functor for the given integer kind via
// ApplyIntegerKind() and invokes it with the remaining arguments.
template <template <typename> class INTEGER_ACCUM>
inline RT_API_ATTRS void PartialIntegerReduction(Descriptor &result,
    const Descriptor &x, int dim, int kind, const Descriptor *mask,
    const char *intrinsic, Terminator &terminator) {
  using Helper = PartialIntegerReductionHelper<INTEGER_ACCUM>;
  // The first terminator argument is consumed by ApplyIntegerKind();
  // the second is forwarded to the functor.
  ApplyIntegerKind<Helper::template Functor, void>(
      kind, terminator, result, x, dim, mask, terminator, intrinsic);
}
// Kind-dispatch helper for partial reductions of floating-point arrays of
// category CAT.  Functor<KIND> instantiates the accumulator template ACCUM
// on a REAL type of kind max(KIND, 8), whatever CAT is, and runs
// PartialReduction() with a result of type {CAT, KIND}.
template <TypeCategory CAT, template <typename> class ACCUM>
struct PartialFloatingReductionHelper {
  template <int KIND> struct Functor {
    // use at least "double" for intermediate results
    static constexpr int Intermediate{std::max(KIND, 8)};
    RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
        int dim, const Descriptor *mask, Terminator &terminator,
        const char *intrinsic) const {
      using IntermediateType = CppTypeFor<TypeCategory::Real, Intermediate>;
      using Accumulator = ACCUM<IntermediateType>;
      Accumulator accumulator{x};
      // The destination descriptor's elements have the same size as
      // the source's.
      const auto elementBytes{x.ElementBytes()};
      PartialReduction<Accumulator, CAT, KIND>(result, x, elementBytes, dim,
          mask, terminator, intrinsic, accumulator);
    }
  };
};
// Partial reduction of a numeric array whose type is known only at run time.
// The category and kind are read from the type code of x, and the reduction
// is dispatched to the accumulator template matching the category:
// INTEGER_ACCUM, REAL_ACCUM, or COMPLEX_ACCUM.  Any other category (or an
// unrecognized type code) is a fatal error reported with source/line.
template <template <typename> class INTEGER_ACCUM,
    template <typename> class REAL_ACCUM,
    template <typename> class COMPLEX_ACCUM>
inline RT_API_ATTRS void TypedPartialNumericReduction(Descriptor &result,
    const Descriptor &x, int dim, const char *source, int line,
    const Descriptor *mask, const char *intrinsic) {
  using RealHelper =
      PartialFloatingReductionHelper<TypeCategory::Real, REAL_ACCUM>;
  using ComplexHelper =
      PartialFloatingReductionHelper<TypeCategory::Complex, COMPLEX_ACCUM>;
  Terminator terminator{source, line};
  auto catKind{x.type().GetCategoryAndKind()};
  RUNTIME_CHECK(terminator, catKind.has_value());
  const auto category{catKind->first};
  if (category == TypeCategory::Integer) {
    PartialIntegerReduction<INTEGER_ACCUM>(
        result, x, dim, catKind->second, mask, intrinsic, terminator);
  } else if (category == TypeCategory::Real) {
    ApplyFloatingPointKind<RealHelper::template Functor, void>(catKind->second,
        terminator, result, x, dim, mask, terminator, intrinsic);
  } else if (category == TypeCategory::Complex) {
    ApplyFloatingPointKind<ComplexHelper::template Functor, void>(
        catKind->second, terminator, result, x, dim, mask, terminator,
        intrinsic);
  } else {
    terminator.Crash("%s: bad type code %d", intrinsic, x.type().raw());
  }
}
// Kind-dispatch helper that copies an accumulator's result into a result
// descriptor whose elements are INTEGER(KIND): Functor<KIND> hands
// GetResult() a pointer obtained from result.OffsetElement<>() with its
// default argument.
template <typename ACCUMULATOR> struct LocationResultHelper {
  template <int KIND> struct Functor {
    RT_API_ATTRS void operator()(
        ACCUMULATOR &accumulator, const Descriptor &result) const {
      using ResultInteger = CppTypeFor<TypeCategory::Integer, KIND>;
      accumulator.GetResult(result.OffsetElement<ResultInteger>());
    }
  };
};
// Kind-dispatch helper for partial reductions whose result elements are
// INTEGER(KIND), using a caller-supplied accumulator: Functor<KIND> forwards
// to PartialReduction() with the result element size of that integer kind.
template <typename ACCUMULATOR> struct PartialLocationHelper {
  template <int KIND> struct Functor {
    RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x,
        int dim, const Descriptor *mask, Terminator &terminator,
        const char *intrinsic, ACCUMULATOR &accumulator) const {
      // The destination descriptor's element size is that of
      // {TypeCategory::Integer, KIND}, not that of the source.
      const auto resultElementBytes{
          Descriptor::BytesFor(TypeCategory::Integer, KIND)};
      PartialReduction<ACCUMULATOR, TypeCategory::Integer, KIND>(result, x,
          resultElementBytes, dim, mask, terminator, intrinsic, accumulator);
    }
  };
};
// NORM2 templates
RT_VAR_GROUP_BEGIN
// Norm2LargestLDKind is the largest REAL kind that may be used for NORM2
// accumulation; Norm2AccumType uses it as the upper bound when choosing
// the accumulator's kind.  Its value is selected at preprocessing time:
//   16 when LDBL_MANT_DIG == 113 or HAS_FLOAT128 is nonzero,
//   10 when LDBL_MANT_DIG == 64,
//    8 otherwise.
// Use at least double precision for accumulators.
// Don't use __float128, it doesn't work with abs() or sqrt() yet.
// NOTE(review): the HAS_FLOAT128 test below nevertheless selects kind 16;
// confirm whether that agrees with the remark about __float128 above.
// NOTE(review): LDBL_MANT_DIG comes from <cfloat>, which this header does not
// include directly; presumably another included header supplies it.  An
// undefined macro evaluates as 0 in #if, which would silently select 8
// unless HAS_FLOAT128 is set -- TODO confirm.
static constexpr RT_CONST_VAR_ATTRS int Norm2LargestLDKind {
#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
16
#elif LDBL_MANT_DIG == 64
10
#else
8
#endif
};
RT_VAR_GROUP_END
// Common driver for reductions whose result always goes into a
// caller-supplied descriptor and has the same type as the argument.
// An accumulator of type ACCUMULATOR is constructed on x.
//  - When dim is 0 or x has rank 1, the reduction is total: result is
//    established as an allocatable scalar with the type and element size of
//    x, allocated (a failure is fatal), and filled from the accumulator.
//  - Otherwise the reduction is partial and is delegated to
//    PartialReduction().
template <TypeCategory CAT, int KIND, typename ACCUMULATOR>
inline RT_API_ATTRS void DoMaxMinNorm2(Descriptor &result, const Descriptor &x,
    int dim, const Descriptor *mask, const char *intrinsic,
    Terminator &terminator) {
  using Type = CppTypeFor<CAT, KIND>;
  ACCUMULATOR accumulator{x};
  if (dim != 0 && x.rank() != 1) {
    // Partial reduction
    // The destination descriptor's elements have the same size as
    // the source's.
    PartialReduction<ACCUMULATOR, CAT, KIND>(result, x, x.ElementBytes(), dim,
        mask, terminator, intrinsic, accumulator);
    return;
  }
  // Total reduction
  // The destination descriptor's elements have the same size as
  // the source's.
  result.Establish(x.type(), x.ElementBytes(), nullptr, 0, nullptr,
      CFI_attribute_allocatable);
  const int stat{result.Allocate()};
  if (stat) {
    terminator.Crash(
        "%s: could not allocate memory for result; STAT=%d", intrinsic, stat);
  }
  DoTotalReduction<Type>(x, dim, mask, accumulator, intrinsic, terminator);
  accumulator.GetResult(result.OffsetElement<Type>());
}
// The data type used by Norm2Accumulator: a REAL type whose kind is KIND
// raised to at least 8 and then limited to at most Norm2LargestLDKind.
template <int KIND>
using Norm2AccumType = CppTypeFor<TypeCategory::Real,
    std::min(std::max(KIND, 8), Norm2LargestLDKind)>;
// Accumulator for NORM2 over REAL(KIND) data.
// Instead of summing squares directly, it tracks the largest magnitude seen
// so far (m) together with the sum of the squares of the other values
// divided by m.  The result is then m * sqrt(1 + sum((others(:)/m)**2)).
// All arithmetic is done in AccumType; the result is converted to Type.
template <int KIND> class Norm2Accumulator {
public:
  using Type = CppTypeFor<TypeCategory::Real, KIND>;
  using AccumType = Norm2AccumType<KIND>;

  // Captures a reference to the array whose elements will be accumulated.
  explicit RT_API_ATTRS Norm2Accumulator(const Descriptor &array)
      : array_{array} {}

  // Resets the accumulator to its initial (empty) state.
  RT_API_ATTRS void Reinitialize() {
    sum_ = 0;
    max_ = 0;
  }

  // Stores the norm of the values accumulated so far into *p.
  template <typename A>
  RT_API_ATTRS void GetResult(A *p, int /*zeroBasedDim*/ = -1) const {
    // m * sqrt(1 + sum((others(:)/m)**2))
    *p = static_cast<Type>(max_ * SQRTTy<AccumType>::compute(1 + sum_));
  }

  // Folds one value into the accumulator; always returns true, so the
  // traversal is never cut short.
  RT_API_ATTRS bool Accumulate(Type x) {
    auto magnitude{ABSTy<AccumType>::compute(static_cast<AccumType>(x))};
    if (!max_) {
      // No nonzero maximum has been recorded yet.
      max_ = magnitude;
      return true;
    }
    if (magnitude > max_) {
      // New maximum: rescale what has been summed so far.
      auto ratio{max_ / magnitude}; // < 1.0
      auto ratioSquared{ratio * ratio};
      sum_ *= ratioSquared; // scale sum to reflect change to the max
      sum_ += ratioSquared; // include a term for the previous max
      max_ = magnitude;
      return true;
    }
    // magnitude <= max_
    auto ratio{magnitude / max_};
    sum_ += ratio * ratio;
    return true;
  }

  // Accumulates the element of the array at the given subscripts, read as
  // type A.
  template <typename A>
  RT_API_ATTRS bool AccumulateAt(const SubscriptValue at[]) {
    return Accumulate(*array_.Element<A>(at));
  }

private:
  const Descriptor &array_;
  AccumType max_{0}; // value (m) with largest magnitude
  AccumType sum_{0}; // sum((others(:)/m)**2)
};
// Kind-dispatch functor for NORM2 on REAL(KIND) arguments: runs
// DoMaxMinNorm2() with a Norm2Accumulator of the same kind.
template <int KIND> struct Norm2Helper {
  RT_API_ATTRS void operator()(Descriptor &result, const Descriptor &x, int dim,
      const Descriptor *mask, Terminator &terminator) const {
    using Accumulator = Norm2Accumulator<KIND>;
    DoMaxMinNorm2<TypeCategory::Real, KIND, Accumulator>(
        result, x, dim, mask, "NORM2", terminator);
  }
};
} // namespace Fortran::runtime
#endif // FORTRAN_RUNTIME_REDUCTION_TEMPLATES_H_