Revert "[clang][openmp][NFC] Remove arch-specific CGOpenMPRuntimeGPU files"
This reverts commit 81a7cad2ff.
clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.cpp (Normal file, 48 lines)
@@ -0,0 +1,48 @@
+//===-- CGOpenMPRuntimeAMDGCN.cpp - Interface to OpenMP AMDGCN Runtimes --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to
+// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/Attr.h"
+#include "clang/AST/DeclOpenMP.h"
+#include "clang/AST/StmtOpenMP.h"
+#include "clang/AST/StmtVisitor.h"
+#include "clang/Basic/Cuda.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+
+using namespace clang;
+using namespace CodeGen;
+using namespace llvm::omp;
+
+CGOpenMPRuntimeAMDGCN::CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM)
+    : CGOpenMPRuntimeGPU(CGM) {
+  if (!CGM.getLangOpts().OpenMPIsDevice)
+    llvm_unreachable("OpenMP AMDGCN can only handle device code.");
+}
+
+llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUWarpSize(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  // return constant compile-time target-specific warp size
+  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
+  return Bld.getInt32(WarpSize);
+}
+
+llvm::Value *CGOpenMPRuntimeAMDGCN::getGPUThreadID(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Function *F =
+      CGF.CGM.getIntrinsic(llvm::Intrinsic::amdgcn_workitem_id_x);
+  return Bld.CreateCall(F, llvm::None, "nvptx_tid");
+}
clang/lib/CodeGen/CGOpenMPRuntimeAMDGCN.h (Normal file, 40 lines)
@@ -0,0 +1,40 @@
+//===--- CGOpenMPRuntimeAMDGCN.h - Interface to OpenMP AMDGCN Runtimes ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to
+// AMDGCN targets from generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
+#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
+
+#include "CGOpenMPRuntime.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/StmtOpenMP.h"
+
+namespace clang {
+namespace CodeGen {
+
+class CGOpenMPRuntimeAMDGCN final : public CGOpenMPRuntimeGPU {
+
+public:
+  explicit CGOpenMPRuntimeAMDGCN(CodeGenModule &CGM);
+
+  /// Get the GPU warp size.
+  llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
+
+  /// Get the id of the current thread on the GPU.
+  llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
+};
+
+} // namespace CodeGen
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEAMDGCN_H
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "CGOpenMPRuntimeGPU.h"
+#include "CGOpenMPRuntimeNVPTX.h"
 #include "CodeGenFunction.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/DeclOpenMP.h"
@@ -20,6 +21,7 @@
 #include "clang/Basic/Cuda.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
 #include "llvm/Support/MathExtras.h"
 
 using namespace clang;
@@ -1195,7 +1197,7 @@ unsigned CGOpenMPRuntimeGPU::getDefaultLocationReserved2Flags() const {
 CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
     : CGOpenMPRuntime(CGM, "_", "$") {
   if (!CGM.getLangOpts().OpenMPIsDevice)
-    llvm_unreachable("OpenMP can only handle device code.");
+    llvm_unreachable("OpenMP NVPTX can only handle device code.");
 
   llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
   if (CGM.getLangOpts().OpenMPTargetNewRuntime) {
@@ -3958,18 +3960,3 @@ llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
   }
   return Bld.CreateCall(F, llvm::None, "nvptx_num_threads");
 }
-
-llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
-  ArrayRef<llvm::Value *> Args{};
-  return CGF.EmitRuntimeCall(
-      OMPBuilder.getOrCreateRuntimeFunction(
-          CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
-      Args);
-}
-
-llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
-  ArrayRef<llvm::Value *> Args{};
-  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-                                 CGM.getModule(), OMPRTL___kmpc_get_warp_size),
-                             Args);
-}
@@ -176,10 +176,10 @@ public:
   /// and NVPTX.
 
   /// Get the GPU warp size.
-  llvm::Value *getGPUWarpSize(CodeGenFunction &CGF);
+  virtual llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) = 0;
 
   /// Get the id of the current thread on the GPU.
-  llvm::Value *getGPUThreadID(CodeGenFunction &CGF);
+  virtual llvm::Value *getGPUThreadID(CodeGenFunction &CGF) = 0;
 
   /// Get the maximum number of threads in a block of the GPU.
   llvm::Value *getGPUNumThreads(CodeGenFunction &CGF);
clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp (Normal file, 48 lines)
@@ -0,0 +1,48 @@
+//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to NVPTX
+// targets from generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGOpenMPRuntimeNVPTX.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/Attr.h"
+#include "clang/AST/DeclOpenMP.h"
+#include "clang/AST/StmtOpenMP.h"
+#include "clang/AST/StmtVisitor.h"
+#include "clang/Basic/Cuda.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
+
+using namespace clang;
+using namespace CodeGen;
+using namespace llvm::omp;
+
+CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
+    : CGOpenMPRuntimeGPU(CGM) {
+  if (!CGM.getLangOpts().OpenMPIsDevice)
+    llvm_unreachable("OpenMP NVPTX can only handle device code.");
+}
+
+llvm::Value *CGOpenMPRuntimeNVPTX::getGPUWarpSize(CodeGenFunction &CGF) {
+  return CGF.EmitRuntimeCall(
+      llvm::Intrinsic::getDeclaration(
+          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
+      "nvptx_warp_size");
+}
+
+llvm::Value *CGOpenMPRuntimeNVPTX::getGPUThreadID(CodeGenFunction &CGF) {
+  CGBuilderTy &Bld = CGF.Builder;
+  llvm::Function *F;
+  F = llvm::Intrinsic::getDeclaration(
+      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x);
+  return Bld.CreateCall(F, llvm::None, "nvptx_tid");
+}
clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h (Normal file, 40 lines)
@@ -0,0 +1,40 @@
+//===----- CGOpenMPRuntimeNVPTX.h - Interface to OpenMP NVPTX Runtimes ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This provides a class for OpenMP runtime code generation specialized to NVPTX
+// targets from generalized CGOpenMPRuntimeGPU class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
+#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
+
+#include "CGOpenMPRuntime.h"
+#include "CGOpenMPRuntimeGPU.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/StmtOpenMP.h"
+
+namespace clang {
+namespace CodeGen {
+
+class CGOpenMPRuntimeNVPTX final : public CGOpenMPRuntimeGPU {
+
+public:
+  explicit CGOpenMPRuntimeNVPTX(CodeGenModule &CGM);
+
+  /// Get the GPU warp size.
+  llvm::Value *getGPUWarpSize(CodeGenFunction &CGF) override;
+
+  /// Get the id of the current thread on the GPU.
+  llvm::Value *getGPUThreadID(CodeGenFunction &CGF) override;
+};
+
+} // CodeGen namespace.
+} // clang namespace.
+
+#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMENVPTX_H
@@ -59,7 +59,9 @@ add_clang_library(clangCodeGen
   CGObjCRuntime.cpp
   CGOpenCLRuntime.cpp
   CGOpenMPRuntime.cpp
+  CGOpenMPRuntimeAMDGCN.cpp
   CGOpenMPRuntimeGPU.cpp
+  CGOpenMPRuntimeNVPTX.cpp
   CGRecordLayoutBuilder.cpp
   CGStmt.cpp
   CGStmtOpenMP.cpp
@@ -19,7 +19,8 @@
 #include "CGObjCRuntime.h"
 #include "CGOpenCLRuntime.h"
 #include "CGOpenMPRuntime.h"
-#include "CGOpenMPRuntimeGPU.h"
+#include "CGOpenMPRuntimeAMDGCN.h"
+#include "CGOpenMPRuntimeNVPTX.h"
 #include "CodeGenFunction.h"
 #include "CodeGenPGO.h"
 #include "ConstantEmitter.h"
@@ -243,10 +244,14 @@ void CodeGenModule::createOpenMPRuntime() {
   switch (getTriple().getArch()) {
   case llvm::Triple::nvptx:
   case llvm::Triple::nvptx64:
+    assert(getLangOpts().OpenMPIsDevice &&
+           "OpenMP NVPTX is only prepared to deal with device code.");
+    OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
+    break;
   case llvm::Triple::amdgcn:
     assert(getLangOpts().OpenMPIsDevice &&
-           "OpenMP AMDGPU/NVPTX is only prepared to deal with device code.");
-    OpenMPRuntime.reset(new CGOpenMPRuntimeGPU(*this));
+           "OpenMP AMDGCN is only prepared to deal with device code.");
+    OpenMPRuntime.reset(new CGOpenMPRuntimeAMDGCN(*this));
     break;
   default:
     if (LangOpts.OpenMPSimd)
@@ -1664,31 +1664,31 @@ int bar(int n){
 // CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
 // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
 // CHECK1-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
-// CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
 // CHECK1-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
 // CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]]
 // CHECK1: omp.critical.loop:
-// CHECK1-NEXT: [[TMP3:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
-// CHECK1-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP3]], [[NVPTX_NUM_THREADS]]
-// CHECK1-NEXT: br i1 [[TMP4]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
+// CHECK1-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK1-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
+// CHECK1-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
 // CHECK1: omp.critical.test:
-// CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
-// CHECK1-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP2]], [[TMP5]]
-// CHECK1-NEXT: br i1 [[TMP6]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
+// CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK1-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
+// CHECK1-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
 // CHECK1: omp.critical.body:
-// CHECK1-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
-// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK1-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], [8 x i32]* @"_gomp_critical_user_$var")
-// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP0]], align 4
-// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK1-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK1-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
+// CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
 // CHECK1-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4
-// CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], [8 x i32]* @"_gomp_critical_user_$var")
+// CHECK1-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
 // CHECK1-NEXT: br label [[OMP_CRITICAL_SYNC]]
 // CHECK1: omp.critical.sync:
 // CHECK1-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
-// CHECK1-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP5]], 1
-// CHECK1-NEXT: store i32 [[TMP10]], i32* [[CRITICAL_COUNTER]], align 4
+// CHECK1-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK1-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
 // CHECK1-NEXT: br label [[OMP_CRITICAL_LOOP]]
 // CHECK1: omp.critical.exit:
 // CHECK1-NEXT: ret void
@@ -1936,31 +1936,31 @@ int bar(int n){
 // CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4
 // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4
 // CHECK2-NEXT: [[TMP1:%.*]] = call i64 @__kmpc_warp_active_thread_mask()
-// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @__kmpc_get_hardware_num_threads_in_block()
 // CHECK2-NEXT: store i32 0, i32* [[CRITICAL_COUNTER]], align 4
 // CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP:%.*]]
 // CHECK2: omp.critical.loop:
-// CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
-// CHECK2-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP3]], [[NVPTX_NUM_THREADS]]
-// CHECK2-NEXT: br i1 [[TMP4]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
+// CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK2-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], [[NVPTX_NUM_THREADS]]
+// CHECK2-NEXT: br i1 [[TMP3]], label [[OMP_CRITICAL_TEST:%.*]], label [[OMP_CRITICAL_EXIT:%.*]]
 // CHECK2: omp.critical.test:
-// CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
-// CHECK2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP2]], [[TMP5]]
-// CHECK2-NEXT: br i1 [[TMP6]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
+// CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[CRITICAL_COUNTER]], align 4
+// CHECK2-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID]], [[TMP4]]
+// CHECK2-NEXT: br i1 [[TMP5]], label [[OMP_CRITICAL_BODY:%.*]], label [[OMP_CRITICAL_SYNC:%.*]]
 // CHECK2: omp.critical.body:
-// CHECK2-NEXT: [[TMP7:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
-// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
-// CHECK2-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], [8 x i32]* @"_gomp_critical_user_$var")
-// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP0]], align 4
-// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP9]], 1
+// CHECK2-NEXT: [[TMP6:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4
+// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
+// CHECK2-NEXT: call void @__kmpc_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
+// CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP8]], 1
 // CHECK2-NEXT: store i32 [[INC]], i32* [[TMP0]], align 4
-// CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP8]], [8 x i32]* @"_gomp_critical_user_$var")
+// CHECK2-NEXT: call void @__kmpc_end_critical(%struct.ident_t* @[[GLOB1]], i32 [[TMP7]], [8 x i32]* @"_gomp_critical_user_$var")
 // CHECK2-NEXT: br label [[OMP_CRITICAL_SYNC]]
 // CHECK2: omp.critical.sync:
 // CHECK2-NEXT: call void @__kmpc_syncwarp(i64 [[TMP1]])
-// CHECK2-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP5]], 1
-// CHECK2-NEXT: store i32 [[TMP10]], i32* [[CRITICAL_COUNTER]], align 4
+// CHECK2-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK2-NEXT: store i32 [[TMP9]], i32* [[CRITICAL_COUNTER]], align 4
 // CHECK2-NEXT: br label [[OMP_CRITICAL_LOOP]]
 // CHECK2: omp.critical.exit:
 // CHECK2-NEXT: ret void
@@ -115,7 +115,7 @@ int bar(int n){
 // CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64*
 // CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64*
 // CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align
-// CHECK: [[WS32:%.+]] = call i32 @__kmpc_get_warp_size()
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
 // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
 //
@@ -320,7 +320,7 @@ int bar(int n){
 // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align
 //
 // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32
-// CHECK: [[WS32:%.+]] = call i32 @__kmpc_get_warp_size()
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
 // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
 // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8
@@ -336,7 +336,7 @@ int bar(int n){
 // CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32*
 // CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32*
 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align
-// CHECK: [[WS32:%.+]] = call i32 @__kmpc_get_warp_size()
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
 // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
 //
@@ -617,7 +617,7 @@ int bar(int n){
 // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32*
 // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align
 //
-// CHECK: [[WS32:%.+]] = call i32 @__kmpc_get_warp_size()
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
 // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]])
 //
@@ -632,7 +632,7 @@ int bar(int n){
 // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align
 //
 // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32
-// CHECK: [[WS32:%.+]] = call i32 @__kmpc_get_warp_size()
+// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16
 // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]])
 // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16
@@ -461,7 +461,7 @@ void test() {
 // CHECK1-NEXT: [[TMP15:%.*]] = bitcast %"class.std::complex"* [[TMP12]] to i64*
 // CHECK1-NEXT: [[TMP16:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
 // CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 4
-// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 // CHECK1-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
 // CHECK1-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
 // CHECK1-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 4
@@ -520,10 +520,10 @@ void test() {
 // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA12]]
 // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
-// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
 // CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
 // CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
@@ -553,7 +553,7 @@ void test() {
 // CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
 // CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
 // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK1: then2:
+// CHECK1: then4:
 // CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
 // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
 // CHECK1-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8, !tbaa [[TBAA12]]
@@ -562,9 +562,9 @@ void test() {
 // CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4, !tbaa [[TBAA8]]
 // CHECK1-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4, !tbaa [[TBAA8]]
 // CHECK1-NEXT: br label [[IFCONT6:%.*]]
-// CHECK1: else3:
+// CHECK1: else5:
 // CHECK1-NEXT: br label [[IFCONT6]]
-// CHECK1: ifcont4:
+// CHECK1: ifcont6:
 // CHECK1-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
 // CHECK1-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
 // CHECK1-NEXT: br label [[PRECOND]]
@@ -1040,7 +1040,7 @@ void test() {
 // CHECK1-NEXT: br i1 [[TMP24]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
 // CHECK1: .shuffle.then:
 // CHECK1-NEXT: [[TMP25:%.*]] = load i64, i64* [[TMP17]], align 8
-// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK1-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 // CHECK1-NEXT: [[TMP26:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
 // CHECK1-NEXT: [[TMP27:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP25]], i16 [[TMP7]], i16 [[TMP26]])
 // CHECK1-NEXT: store i64 [[TMP27]], i64* [[TMP18]], align 8
@@ -1101,10 +1101,10 @@ void test() {
 // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK1-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA12]]
 // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
-// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK1-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK1-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK1-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
 // CHECK1-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
 // CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
@@ -1134,7 +1134,7 @@ void test() {
 // CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
 // CHECK1-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
 // CHECK1-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK1: then2:
+// CHECK1: then4:
 // CHECK1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
 // CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
 // CHECK1-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8, !tbaa [[TBAA12]]
@@ -1143,9 +1143,9 @@ void test() {
 // CHECK1-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4, !tbaa [[TBAA8]]
 // CHECK1-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4, !tbaa [[TBAA8]]
 // CHECK1-NEXT: br label [[IFCONT6:%.*]]
-// CHECK1: else3:
+// CHECK1: else5:
 // CHECK1-NEXT: br label [[IFCONT6]]
-// CHECK1: ifcont4:
+// CHECK1: ifcont6:
 // CHECK1-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
 // CHECK1-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
 // CHECK1-NEXT: br label [[PRECOND]]
@@ -1695,7 +1695,7 @@ void test() {
 // CHECK2-NEXT: [[TMP15:%.*]] = bitcast %"class.std::complex"* [[TMP12]] to i64*
 // CHECK2-NEXT: [[TMP16:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
 // CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 4
-// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 // CHECK2-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
 // CHECK2-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
 // CHECK2-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 4
@@ -1754,10 +1754,10 @@ void test() {
 // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA12]]
 // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
-// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
 // CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
 // CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
@@ -1787,7 +1787,7 @@ void test() {
 // CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
 // CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
 // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK2: then2:
+// CHECK2: then4:
 // CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
 // CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
 // CHECK2-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8, !tbaa [[TBAA12]]
@@ -1796,9 +1796,9 @@ void test() {
 // CHECK2-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4, !tbaa [[TBAA8]]
 // CHECK2-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4, !tbaa [[TBAA8]]
 // CHECK2-NEXT: br label [[IFCONT6:%.*]]
-// CHECK2: else3:
+// CHECK2: else5:
 // CHECK2-NEXT: br label [[IFCONT6]]
-// CHECK2: ifcont4:
+// CHECK2: ifcont6:
 // CHECK2-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
 // CHECK2-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
 // CHECK2-NEXT: br label [[PRECOND]]
@@ -2274,7 +2274,7 @@ void test() {
 // CHECK2-NEXT: br i1 [[TMP24]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
 // CHECK2: .shuffle.then:
 // CHECK2-NEXT: [[TMP25:%.*]] = load i64, i64* [[TMP17]], align 8
-// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK2-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 // CHECK2-NEXT: [[TMP26:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
 // CHECK2-NEXT: [[TMP27:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP25]], i16 [[TMP7]], i16 [[TMP26]])
 // CHECK2-NEXT: store i64 [[TMP27]], i64* [[TMP18]], align 8
@@ -2335,10 +2335,10 @@ void test() {
 // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK2-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA12]]
 // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
-// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK2-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK2-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK2-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
 // CHECK2-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
 // CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
@@ -2368,7 +2368,7 @@ void test() {
 // CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
 // CHECK2-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
 // CHECK2-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK2: then2:
+// CHECK2: then4:
 // CHECK2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
 // CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
 // CHECK2-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8, !tbaa [[TBAA12]]
@@ -2377,9 +2377,9 @@ void test() {
 // CHECK2-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4, !tbaa [[TBAA8]]
 // CHECK2-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4, !tbaa [[TBAA8]]
 // CHECK2-NEXT: br label [[IFCONT6:%.*]]
-// CHECK2: else3:
+// CHECK2: else5:
 // CHECK2-NEXT: br label [[IFCONT6]]
-// CHECK2: ifcont4:
+// CHECK2: ifcont6:
 // CHECK2-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
 // CHECK2-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
 // CHECK2-NEXT: br label [[PRECOND]]
@@ -2929,7 +2929,7 @@ void test() {
 // CHECK3-NEXT: [[TMP15:%.*]] = bitcast %"class.std::complex"* [[TMP12]] to i64*
 // CHECK3-NEXT: [[TMP16:%.*]] = bitcast %"class.std::complex"* [[DOTOMP_REDUCTION_ELEMENT]] to i64*
 // CHECK3-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 4
-// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 // CHECK3-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
 // CHECK3-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]])
 // CHECK3-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 4
@@ -2988,10 +2988,10 @@ void test() {
 // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA12]]
 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
-// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
 // CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
 // CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
@@ -3021,7 +3021,7 @@ void test() {
 // CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
 // CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
 // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK3: then2:
+// CHECK3: then4:
 // CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
 // CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
 // CHECK3-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8, !tbaa [[TBAA12]]
@@ -3030,9 +3030,9 @@ void test() {
 // CHECK3-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4, !tbaa [[TBAA8]]
 // CHECK3-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4, !tbaa [[TBAA8]]
 // CHECK3-NEXT: br label [[IFCONT6:%.*]]
-// CHECK3: else3:
+// CHECK3: else5:
 // CHECK3-NEXT: br label [[IFCONT6]]
-// CHECK3: ifcont4:
+// CHECK3: ifcont6:
 // CHECK3-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
 // CHECK3-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
 // CHECK3-NEXT: br label [[PRECOND]]
@@ -3508,7 +3508,7 @@ void test() {
 // CHECK3-NEXT: br i1 [[TMP24]], label [[DOTSHUFFLE_THEN]], label [[DOTSHUFFLE_EXIT:%.*]]
 // CHECK3: .shuffle.then:
 // CHECK3-NEXT: [[TMP25:%.*]] = load i64, i64* [[TMP17]], align 8
-// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @__kmpc_get_warp_size()
+// CHECK3-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 // CHECK3-NEXT: [[TMP26:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16
 // CHECK3-NEXT: [[TMP27:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP25]], i16 [[TMP7]], i16 [[TMP26]])
 // CHECK3-NEXT: store i64 [[TMP27]], i64* [[TMP18]], align 8
@@ -3569,10 +3569,10 @@ void test() {
 // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
 // CHECK3-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8, !tbaa [[TBAA12]]
 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
-// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
-// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK3-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31
-// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block()
+// CHECK3-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK3-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5
 // CHECK3-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8
 // CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]*
@@ -3602,7 +3602,7 @@ void test() {
 // CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4, !tbaa [[TBAA8]]
 // CHECK3-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]]
 // CHECK3-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]]
-// CHECK3: then2:
+// CHECK3: then4:
 // CHECK3-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]]
 // CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0
 // CHECK3-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8, !tbaa [[TBAA12]]
@@ -3611,9 +3611,9 @@ void test() {
 // CHECK3-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4, !tbaa [[TBAA8]]
 // CHECK3-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4, !tbaa [[TBAA8]]
 // CHECK3-NEXT: br label [[IFCONT6:%.*]]
-// CHECK3: else3:
+// CHECK3: else5:
 // CHECK3-NEXT: br label [[IFCONT6]]
-// CHECK3: ifcont4:
+// CHECK3: ifcont6:
 // CHECK3-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1
 // CHECK3-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4, !tbaa [[TBAA8]]
 // CHECK3-NEXT: br label [[PRECOND]]
File diff suppressed because it is too large
@@ -455,8 +455,6 @@ __OMP_RTL(__kmpc_barrier_simple_generic, false, Void, IdentPtr, Int32)
 __OMP_RTL(__kmpc_warp_active_thread_mask, false, Int64,)
 __OMP_RTL(__kmpc_syncwarp, false, Void, Int64)
 
-__OMP_RTL(__kmpc_get_warp_size, false, Int32, )
-
 __OMP_RTL(__kmpc_is_generic_main_thread_id, false, Int8, Int32)
 
 __OMP_RTL(__last, false, Void, )
@@ -277,10 +277,5 @@ __attribute__((noinline)) uint32_t __kmpc_get_hardware_num_threads_in_block() {
   FunctionTracingRAII();
   return impl::getNumHardwareThreadsInBlock();
 }
-
-__attribute__((noinline)) uint32_t __kmpc_get_warp_size() {
-  FunctionTracingRAII();
-  return impl::getWarpSize();
-}
 }
 #pragma omp end declare target
@@ -24,7 +24,6 @@ namespace _OMP {
 __attribute__((used, weak, optnone)) void keepAlive() {
   __kmpc_get_hardware_thread_id_in_block();
   __kmpc_get_hardware_num_threads_in_block();
-  __kmpc_get_warp_size();
   __kmpc_barrier_simple_spmd(nullptr, 0);
   __kmpc_barrier_simple_generic(nullptr, 0);
 }
@@ -133,11 +133,8 @@ EXTERN int __kmpc_get_hardware_num_threads_in_block() {
                          __builtin_amdgcn_workgroup_size_x());
 }
 
-EXTERN unsigned __kmpc_get_warp_size() {
-  return WARPSIZE;
-}
 EXTERN unsigned GetWarpId() { return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE; }
 EXTERN unsigned GetWarpSize() { return WARPSIZE; }
 EXTERN unsigned GetLaneId() {
   return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
 }
@@ -35,7 +35,7 @@ int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
 ///{
 extern "C" {
 unsigned GetLaneId();
-unsigned __kmpc_get_warp_size();
+unsigned GetWarpSize();
 void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
 uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
 }
@@ -60,7 +60,7 @@ int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var, uint32_t Delta,
 
 inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
                                      int32_t SrcLane) {
-  int Width = __kmpc_get_warp_size();
+  int Width = GetWarpSize();
   int Self = GetLaneId();
   int Index = SrcLane + (Self & ~(Width - 1));
   return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
@@ -90,7 +90,7 @@ inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
 
 inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
                                           uint32_t Delta, int32_t Width) {
-  int32_t T = ((__kmpc_get_warp_size() - Width) << 8) | 0x1f;
+  int32_t T = ((GetWarpSize() - Width) << 8) | 0x1f;
   return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
 }
 
@@ -102,10 +102,10 @@ EXTERN int __kmpc_get_hardware_num_blocks() {
 EXTERN int __kmpc_get_hardware_num_threads_in_block() {
   return __nvvm_read_ptx_sreg_ntid_x();
 }
-EXTERN unsigned __kmpc_get_warp_size() { return WARPSIZE; }
 EXTERN unsigned GetWarpId() {
   return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE;
 }
+EXTERN unsigned GetWarpSize() { return WARPSIZE; }
 EXTERN unsigned GetLaneId() {
   return __kmpc_get_hardware_thread_id_in_block() & (WARPSIZE - 1);
 }