Files
clang-p2996/clang/test/CodeGenCUDA/link-builtin-bitcode-denormal-fp-mode.cu
pvanhout 23431b5246 [clang][CodeGen] Fix GPU-specific attributes being dropped by bitcode linking
Device libs make use of patterns like this:
```
__attribute__((target("gfx11-insts")))
static unsigned do_intrin_stuff(void)
{
  return __builtin_amdgcn_s_sendmsg_rtnl(0x0);
}
```
This pattern is used for functions that are assumed to be eliminated if the current GPU target doesn't support them.
At O0 such functions aren't eliminated by common optimizations but often by AMDGPURemoveIncompatibleFunctions instead, which sees the "+gfx11-insts" attribute on, say, GFX9 and knows it's not valid, so it removes the function.

D142907 accidentally made it so such attributes were dropped during bitcode linking, making it impossible for RemoveIncompatibleFunctions to catch the functions and causing ISel to catch fire eventually.

This fixes the issue and adds a new test to ensure we don't accidentally fall into this trap again.

Fixes SWDEV-403642

Reviewed By: arsenm, yaxunl

Differential Revision: https://reviews.llvm.org/D152251
2023-06-07 15:51:52 +02:00

166 lines
8.2 KiB
Plaintext

// Verify the behavior of the denormal-fp-math attributes when linking bitcode
// built the way rocm-device-libs should be built. The library bitcode should be
// compiled with denormal-fp-math-f32=dynamic, and the dynamic mode should be
// replaced with the denormal mode of the final TU.
// Build the fake device library in the way rocm-device-libs should be built.
//
// RUN: %clang_cc1 -x cl -triple amdgcn-amd-amdhsa -fdenormal-fp-math-f32=dynamic \
// RUN: -mcode-object-version=none -emit-llvm-bc \
// RUN: %S/Inputs/ocml-sample.cl -o %t.dynamic.f32.bc
//
// RUN: %clang_cc1 -x cl -triple amdgcn-amd-amdhsa -fdenormal-fp-math=dynamic \
// RUN: -mcode-object-version=none -emit-llvm-bc \
// RUN: %S/Inputs/ocml-sample.cl -o %t.dynamic.full.bc
// Check the default behavior with no denormal-fp-math arguments.
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 -fcuda-is-device \
// RUN: -mlink-builtin-bitcode %t.dynamic.f32.bc \
// RUN: -emit-llvm %s -o - | FileCheck -implicit-check-not=denormal-fp-math %s --check-prefixes=CHECK,INTERNALIZE
// Check an explicit full ieee request
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 -fcuda-is-device \
// RUN: -fdenormal-fp-math=ieee \
// RUN: -mlink-builtin-bitcode %t.dynamic.f32.bc \
// RUN: -emit-llvm %s -o - | FileCheck -implicit-check-not=denormal-fp-math %s --check-prefixes=CHECK,INTERNALIZE
// Check explicit f32-only flushing request
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 \
// RUN: -fcuda-is-device -fdenormal-fp-math-f32=preserve-sign \
// RUN: -mlink-builtin-bitcode %t.dynamic.f32.bc -emit-llvm %s -o - \
// RUN: | FileCheck -implicit-check-not=denormal-fp-math --enable-var-scope %s --check-prefixes=CHECK,INTERNALIZE,IEEEF64-PSZF32
// Check explicit flush all request. Only the f32 component of the library is
// dynamic, so the linked functions should use IEEE as the base mode and the new
// functions preserve-sign.
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 \
// RUN: -fcuda-is-device -fdenormal-fp-math=preserve-sign \
// RUN: -mlink-builtin-bitcode %t.dynamic.f32.bc -emit-llvm %s -o - \
// RUN: | FileCheck -implicit-check-not=denormal-fp-math --enable-var-scope %s --check-prefixes=CHECK,INTERNALIZE,PSZ
// Check explicit f32-only, ieee-other flushing request
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 \
// RUN: -fcuda-is-device -fdenormal-fp-math=ieee -fdenormal-fp-math-f32=preserve-sign \
// RUN: -mlink-builtin-bitcode %t.dynamic.f32.bc -emit-llvm %s -o - \
// RUN: | FileCheck -implicit-check-not=denormal-fp-math --enable-var-scope %s --check-prefixes=CHECK,INTERNALIZE,IEEEF64-PSZF32
// Check inverse of normal usage. Requesting IEEE f32, with flushed f16/f64
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 \
// RUN: -fcuda-is-device -fdenormal-fp-math=preserve-sign -fdenormal-fp-math-f32=ieee \
// RUN: -mlink-builtin-bitcode %t.dynamic.f32.bc -emit-llvm %s -o - \
// RUN: | FileCheck -implicit-check-not=denormal-fp-math --enable-var-scope %s --check-prefixes=CHECK,INTERNALIZE,IEEEF32-PSZF64-DYNF32
// Check backwards from the normal usage where both library components can be
// overridden.
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 \
// RUN: -fcuda-is-device -fdenormal-fp-math=preserve-sign -fdenormal-fp-math-f32=ieee \
// RUN: -mlink-builtin-bitcode %t.dynamic.full.bc -emit-llvm %s -o - \
// RUN: | FileCheck -implicit-check-not=denormal-fp-math --enable-var-scope %s --check-prefixes=CHECK,INTERNALIZE,IEEEF32-PSZF64-DYNFULL
// Check the case where no internalization is performed
// RUN: %clang_cc1 -x hip -triple amdgcn-amd-amdhsa -target-cpu gfx803 \
// RUN: -fcuda-is-device -fdenormal-fp-math=preserve-sign -fdenormal-fp-math-f32=ieee \
// RUN: -mlink-bitcode-file %t.dynamic.full.bc -emit-llvm %s -o - \
// RUN: | FileCheck -implicit-check-not=denormal-fp-math --enable-var-scope %s --check-prefixes=CHECK,NOINTERNALIZE,NOINTERNALIZE-IEEEF32-PSZF64-DYNFULL
// This test includes no headers, so the function qualifiers it needs are
// defined locally as plain attribute spellings.
#define __device__ __attribute__((device))
#define __global__ __attribute__((global))
// Short alias for _Float16, used by the f16 library declaration below.
typedef _Float16 half;
// Library entry points and the kernels that call them. Everything is wrapped
// in extern "C" so the symbols keep their plain names, which is what the
// define-line patterns after this block match (e.g. @do_f16_stuff).
extern "C" {
// Declarations only: no bodies are provided in this file. The definitions are
// presumably supplied by the bitcode linked in by the RUN lines (built from
// Inputs/ocml-sample.cl) -- confirm against that input.
__device__ half do_f16_stuff(half a, half b, half c);
__device__ float do_f32_stuff(float a, float b, float c);
// Currently all library functions are internalized. Check a weak function in
// case we ever choose to not internalize these. In that case, the safest thing
// to do would likely be to preserve the dynamic denormal-fp-math.
__attribute__((weak)) __device__ float weak_do_f32_stuff(float a, float b, float c);
__device__ double do_f64_stuff(double a, double b, double c);
// The first kernel captures the kernel attribute group number; the two kernels
// below must reuse the same group.
// CHECK: kernel_f16({{.*}}) #[[$KERNELATTR:[0-9]+]]
// Takes float pointers even though the callee works on half: the arguments are
// implicitly converted to half and the half result is converted back to float
// for the store.
__global__ void kernel_f16(float* out, float* a, float* b, float* c) {
// Fixed index: only element 0 of each buffer is read or written.
int id = 0;
out[id] = do_f16_stuff(a[id], b[id], c[id]);
}
// CHECK: kernel_f32({{.*}}) #[[$KERNELATTR]]
// Calls both the regular and the weak f32 library function so that both end up
// referenced from this translation unit.
__global__ void kernel_f32(float* out, float* a, float* b, float* c) {
int id = 0;
out[id] = do_f32_stuff(a[id], b[id], c[id]);
// Accumulate the weak variant's result on top of the regular one.
out[id] += weak_do_f32_stuff(a[id], b[id], c[id]);
}
// CHECK: kernel_f64({{.*}}) #[[$KERNELATTR]]
// Double-precision counterpart of the kernels above.
__global__ void kernel_f64(double* out, double* a, double* b, double* c) {
int id = 0;
out[id] = do_f64_stuff(a[id], b[id], c[id]);
}
}
// INTERNALIZE: define internal half @do_f16_stuff({{.*}}) #[[$FUNCATTR:[0-9]+]]
// INTERNALIZE: define internal float @do_f32_stuff({{.*}}) #[[$FUNCATTR]]
// INTERNALIZE: define internal double @do_f64_stuff({{.*}}) #[[$FUNCATTR]]
// INTERNALIZE: define internal float @weak_do_f32_stuff({{.*}}) #[[$WEAK_FUNCATTR:[0-9]+]]
// NOINTERNALIZE: define dso_local half @do_f16_stuff({{.*}}) #[[$FUNCATTR:[0-9]+]]
// NOINTERNALIZE: define dso_local float @do_f32_stuff({{.*}}) #[[$FUNCATTR]]
// NOINTERNALIZE: define dso_local double @do_f64_stuff({{.*}}) #[[$FUNCATTR]]
// NOINTERNALIZE: define weak float @weak_do_f32_stuff({{.*}}) #[[$WEAK_FUNCATTR:[0-9]+]]
// We should not be littering call sites with the attribute
// Everything should use the default ieee with no explicit attribute
// FIXME: Should check-not "denormal-fp-math" within the denormal-fp-math-f32
// lines.
// Default mode relies on the implicit check-not for the denormal-fp-math.
// PSZ: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign"
// PSZ-SAME: "target-cpu"="gfx803"
// PSZ: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// PSZ-SAME: "target-cpu"="gfx803"
// PSZ: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// PSZ-SAME: "target-cpu"="gfx803"
// FIXME: Should check-not "denormal-fp-math" within the line
// IEEEF64-PSZF32: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// IEEEF64-PSZF32-SAME: "target-cpu"="gfx803"
// IEEEF64-PSZF32: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// IEEEF64-PSZF32-SAME: "target-cpu"="gfx803"
// IEEEF64-PSZF32: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math-f32"="preserve-sign,preserve-sign"
// IEEEF64-PSZF32-SAME: "target-cpu"="gfx803"
// IEEEF32-PSZF64-DYNF32: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}} }
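// No explicit patterns for the $FUNCATTR and $WEAK_FUNCATTR groups here: they
// are expected to carry no denormal attributes and rely on the implicit check-not.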
// IEEEF32-PSZF64-DYNFULL: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee"
// IEEEF32-PSZF64-DYNFULL-SAME: "target-cpu"="gfx803"
// IEEEF32-PSZF64-DYNFULL: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee"
// IEEEF32-PSZF64-DYNFULL-SAME: "target-cpu"="gfx803"
// IEEEF32-PSZF64-DYNFULL: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee"
// IEEEF32-PSZF64-DYNFULL-SAME: "target-cpu"="gfx803"
// -mlink-bitcode-file doesn't internalize or propagate attributes.
// NOINTERNALIZE-IEEEF32-PSZF64-DYNFULL: #[[$KERNELATTR]] = { {{.*}} "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="ieee,ieee" {{.*}} "target-cpu"="gfx803" {{.*}} }
// NOINTERNALIZE-IEEEF32-PSZF64-DYNFULL: #[[$FUNCATTR]] = { {{.*}} "denormal-fp-math"="dynamic,dynamic" {{.*}} }
// NOINTERNALIZE-IEEEF32-PSZF64-DYNFULL: #[[$WEAK_FUNCATTR]] = { {{.*}} "denormal-fp-math"="dynamic,dynamic" {{.*}} }