A new rule was added in OpenMP 5.0: if a list item appears in a reduction, lastprivate, or linear clause on a combined target construct, it is treated as if it also appears in a map clause with a map-type of tofrom.

Currently, implicit map clauses are added for all captured variables, but not for list items that are expressions, such as array elements or array sections. This change adds an implicit map clause for array elements and array sections used in a reduction clause, and skips the implicit map when the expression is not mappable. Note that linear and lastprivate accept only plain variable names as list items, so those cases are already covered by the maps added for captured variables. To handle non-mappable expressions, the mappability check ignores the diagnostic and simply skips adding the implicit map clause.

The changes:
1) Add code to generate the implicit map in ActOnOpenMPExecutableDirective, for OpenMP 5.0 and later.
2) Add an extra defaulted parameter, NoDiagnose, to ActOnOpenMPMapClause; it is used to suppress the error and to skip adding the implicit map during the mappability check.

Note: only two places need to check NoDiagnose. The remaining checks either apply only to OpenMP versions before 5.0, or the error has already been emitted for the reduction clause.

Differential Revision: https://reviews.llvm.org/D108132
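To illustrate the new behavior with a minimal sketch (the function and variable names below are made up for illustration and are not part of this patch): an array section listed in a reduction clause on a combined target construct is now treated as if it were also listed in a map clause with map-type tofrom, so the first directive below requests the same mapping as the second, explicitly spelled-out form.

  // Hypothetical example; with OpenMP 5.0 semantics the section out[0:1] in the
  // reduction clause gets an implicit map(tofrom: out[0:1]) on the target region.
  void vsum(int *in, int *out, int n) {
  #pragma omp target teams distribute parallel for reduction(+ : out[0:1]) \
      map(to : in[0:n])
    for (int i = 0; i < n; i++)
      out[0] += in[i];

  // Equivalent form with the map written explicitly.
  #pragma omp target teams distribute parallel for reduction(+ : out[0:1]) \
      map(to : in[0:n]) map(tofrom : out[0:1])
    for (int i = 0; i < n; i++)
      out[0] += in[i];
  }
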
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
// RUN:   -triple powerpc64le-unknown-unknown -DCUDA \
// RUN:   -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o \
// RUN:   %t-ppc-host.bc

// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ \
// RUN:   -triple nvptx64-unknown-unknown -DCUA \
// RUN:   -fopenmp-targets=nvptx64-nvidia-cuda -DCUDA -emit-llvm %s \
// RUN:   -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc \
// RUN:   -o - | FileCheck %s --check-prefix CHECK

// RUN: %clang_cc1 -verify -fopenmp -x c++ \
// RUN:   -triple powerpc64le-unknown-unknown -DDIAG \
// RUN:   -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm \
// RUN:   %s -o - | FileCheck %s \
// RUN:   --check-prefix=CHECK1

// RUN: %clang_cc1 -verify -fopenmp -x c++ \
// RUN:   -triple i386-unknown-unknown \
// RUN:   -fopenmp-targets=i386-pc-linux-gnu -emit-llvm \
// RUN:   %s -o - | FileCheck %s \
// RUN:   --check-prefix=CHECK2

#if defined(CUDA)
// expected-no-diagnostics

int foo(int n) {
  double *e;
  // no error and no implicit map generated for e[:1]
#pragma omp target parallel reduction(+: e[:1])
  *e = 10;
  ;
  return 0;
}
// CHECK-NOT: @.offload_maptypes
// CHECK: call void @__kmpc_nvptx_end_reduce_nowait(
#elif defined(DIAG)
class S2 {
  mutable int a;
public:
  S2():a(0) { }
  S2(S2 &s2):a(s2.a) { }
  S2 &operator +(S2 &s);
};
int bar() {
  S2 o[5];
  // warning "copyable and not guaranteed to be mapped correctly" and
  // implicit map generated.
#pragma omp target parallel reduction(+:o[0]) //expected-warning {{Type 'S2' is not trivially copyable and not guaranteed to be mapped correctly}}
  for (int i = 0; i < 10; i++);
  double b[10][10][10];
  // no error; an implicit map is generated for the whole array b, but not
  // for the section b[0:2][2:4][1].
#pragma omp target parallel for reduction(task, +: b[0:2][2:4][1])
  for (long long i = 0; i < 10; ++i);
  return 0;
}
// map for variable o
// CHECK1: offload_sizes = private unnamed_addr constant [1 x i64] [i64 4]
// CHECK1: offload_maptypes = private unnamed_addr constant [1 x i64] [i64 547]
// map for b:
// CHECK1: @.offload_sizes{{.*}} = private unnamed_addr constant [1 x i64] [i64 8000]
// CHECK1: @.offload_maptypes{{.*}} = private unnamed_addr constant [1 x i64] [i64 547]
#else
// expected-no-diagnostics

// generate implicit map for array elements or array sections in reduction
// clause. In the following cases, the implicit map is generated for output[0]
// with map size 4 and for output[:3] with map size 12.
void sum(int* input, int size, int* output)
{
#pragma omp target teams distribute parallel for reduction(+: output[0]) \
                                                  map(to: input [0:size])
  for (int i = 0; i < size; i++)
    output[0] += input[i];
#pragma omp target teams distribute parallel for reduction(+: output[:3]) \
                                                  map(to: input [0:size])
  for (int i = 0; i < size; i++)
    output[0] += input[i];
  int a[10];
#pragma omp target parallel reduction(+: a[:2])
  for (int i = 0; i < size; i++)
    ;
#pragma omp target parallel reduction(+: a[3])
  for (int i = 0; i < size; i++)
    ;
}
//CHECK2: @.offload_sizes = private unnamed_addr constant [2 x i64] [i64 4, i64 8]
//CHECK2: @.offload_maptypes.10 = private unnamed_addr constant [2 x i64] [i64 800, i64 547]
//CHECK2: @.offload_sizes.13 = private unnamed_addr constant [2 x i64] [i64 4, i64 4]
//CHECK2: @.offload_maptypes.14 = private unnamed_addr constant [2 x i64] [i64 800, i64 547]
//CHECK2: define dso_local void @_Z3sumPiiS_
//CHECK2-NEXT: entry
//CHECK2-NEXT: [[INP:%.*]] = alloca i32*
//CHECK2-NEXT: [[SIZE:%.*]] = alloca i32
//CHECK2-NEXT: [[OUTP:%.*]] = alloca i32*
//CHECK2: [[OFFSIZE:%.*]] = alloca [3 x i64]
//CHECK2: [[OFFSIZE10:%.*]] = alloca [3 x i64]
//CHECK2: [[T15:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE]], i32 0, i32 0
//CHECK2-NEXT: store i64 4, i64* [[T15]]
//CHECK2: [[T21:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE]], i32 0, i32 1
//CHECK2-NEXT: store i64 4, i64* [[T21]]
//CHECK2: [[T53:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE10]], i32 0, i32 0
//CHECK2-NEXT: store i64 4, i64* [[T53]]
//CHECK2: [[T59:%.*]] = getelementptr inbounds [3 x i64], [3 x i64]* [[OFFSIZE10]], i32 0, i32 1
//CHECK2-NEXT: store i64 12, i64* [[T59]]
#endif
int main()
{
#if defined(CUDA)
  int a = foo(10);
#elif defined(DIAG)
  int a = bar();
#else
  const int size = 100;
  int *array = new int[size];
  int result = 0;
  sum(array, size, &result);
#endif
  return 0;
}