Files
clang-p2996/llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll
Matt Arsenault e0c1f9e76d AMDGPU: Partially fix default device for HSA
There are a few different issues, mostly stemming from using
generation based checks for anything instead of subtarget
features. Stop adding flat-address-space as a feature for HSA, as it
should only be a device property. This was incorrectly allowing flat
instructions to select for SI.

Increase the default generation for HSA to avoid the encoding error
when emitting objects. This has some other side effects from various
checks which probably should be separate subtarget features (in the
cost model and for dealing with the DS offset folding issue).

Partial fix for bug 41070. It should probably be an error to try using
amdhsa without flat support.

llvm-svn: 356347
2019-03-17 21:31:35 +00:00

178 lines
8.3 KiB
LLVM

; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,CIFASTF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,CISLOWF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIFASTF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SISLOWF64,NOFP32DENORM,NOFP16,NOFP16-NOFP32DENORM %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,SLOWFP32DENORMS,NOFP16,NOFP16-FP32DENORM %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+fp32-denormals < %s | FileCheck -check-prefixes=ALL,FP32DENORMS,FASTFP32DENORMS,FP16 %s
; ALL: 'fdiv_f32'
; NOFP32DENORM: estimated cost of 12 for {{.*}} fdiv float
; FP32DENORMS: estimated cost of 10 for {{.*}} fdiv float
define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
%vec = load float, float addrspace(1)* %vaddr
%add = fdiv float %vec, %b
store float %add, float addrspace(1)* %out
ret void
}
; ALL: 'fdiv_v2f32'
; NOFP32DENORM: estimated cost of 24 for {{.*}} fdiv <2 x float>
; FP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float>
define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
%add = fdiv <2 x float> %vec, %b
store <2 x float> %add, <2 x float> addrspace(1)* %out
ret void
}
; ALL: 'fdiv_v3f32'
; Allow for 48/40 when v3f32 is illegal and TargetLowering thinks it needs widening,
; and 36/30 when it is legal.
; NOFP32DENORM: estimated cost of {{36|48}} for {{.*}} fdiv <3 x float>
; FP32DENORMS: estimated cost of {{30|40}} for {{.*}} fdiv <3 x float>
define amdgpu_kernel void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
%add = fdiv <3 x float> %vec, %b
store <3 x float> %add, <3 x float> addrspace(1)* %out
ret void
}
; ALL: 'fdiv_v5f32'
; Allow for 96/80 when v5f32 is illegal and TargetLowering thinks it needs widening,
; and 60/50 when it is legal.
; NOFP32DENORM: estimated cost of {{96|60}} for {{.*}} fdiv <5 x float>
; FP32DENORMS: estimated cost of {{80|50}} for {{.*}} fdiv <5 x float>
define amdgpu_kernel void @fdiv_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
%add = fdiv <5 x float> %vec, %b
store <5 x float> %add, <5 x float> addrspace(1)* %out
ret void
}
; ALL: 'fdiv_f64'
; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double
; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double
; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double
define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
%vec = load double, double addrspace(1)* %vaddr
%add = fdiv double %vec, %b
store double %add, double addrspace(1)* %out
ret void
}
; ALL: 'fdiv_v2f64'
; CIFASTF64: estimated cost of 58 for {{.*}} fdiv <2 x double>
; CISLOWF64: estimated cost of 66 for {{.*}} fdiv <2 x double>
; SIFASTF64: estimated cost of 64 for {{.*}} fdiv <2 x double>
; SISLOWF64: estimated cost of 72 for {{.*}} fdiv <2 x double>
define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
%vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
%add = fdiv <2 x double> %vec, %b
store <2 x double> %add, <2 x double> addrspace(1)* %out
ret void
}
; ALL: 'fdiv_v3f64'
; CIFASTF64: estimated cost of 87 for {{.*}} fdiv <3 x double>
; CISLOWF64: estimated cost of 99 for {{.*}} fdiv <3 x double>
; SIFASTF64: estimated cost of 96 for {{.*}} fdiv <3 x double>
; SISLOWF64: estimated cost of 108 for {{.*}} fdiv <3 x double>
define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
%vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
%add = fdiv <3 x double> %vec, %b
store <3 x double> %add, <3 x double> addrspace(1)* %out
ret void
}
; ALL: 'fdiv_f16'
; NOFP16-NOFP32DENORM: estimated cost of 12 for {{.*}} fdiv half
; NOFP16-FP32DENORM: estimated cost of 10 for {{.*}} fdiv half
; FP16: estimated cost of 10 for {{.*}} fdiv half
define amdgpu_kernel void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
%vec = load half, half addrspace(1)* %vaddr
%add = fdiv half %vec, %b
store half %add, half addrspace(1)* %out
ret void
}
; ALL: 'fdiv_v2f16'
; NOFP16-NOFP32DENORM: estimated cost of 24 for {{.*}} fdiv <2 x half>
; NOFP16-FP32DENORM: estimated cost of 20 for {{.*}} fdiv <2 x half>
; FP16: estimated cost of 20 for {{.*}} fdiv <2 x half>
define amdgpu_kernel void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
%add = fdiv <2 x half> %vec, %b
store <2 x half> %add, <2 x half> addrspace(1)* %out
ret void
}
; ALL: 'fdiv_v4f16'
; NOFP16-NOFP32DENORM: estimated cost of 48 for {{.*}} fdiv <4 x half>
; NOFP16-FP32DENORM: estimated cost of 40 for {{.*}} fdiv <4 x half>
; FP16: estimated cost of 40 for {{.*}} fdiv <4 x half>
define amdgpu_kernel void @fdiv_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
%vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
%add = fdiv <4 x half> %vec, %b
store <4 x half> %add, <4 x half> addrspace(1)* %out
ret void
}
; ALL: 'rcp_f32'
; NOFP32DENORM: estimated cost of 3 for {{.*}} fdiv float
; SLOWFP32DENORMS: estimated cost of 10 for {{.*}} fdiv float
; FASTFP32DENORMS: estimated cost of 10 for {{.*}} fdiv float
define amdgpu_kernel void @rcp_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
%vec = load float, float addrspace(1)* %vaddr
%add = fdiv float 1.0, %vec
store float %add, float addrspace(1)* %out
ret void
}
; ALL: 'rcp_f16'
; NOFP16-NOFP32DENORM: estimated cost of 3 for {{.*}} fdiv half
; NOFP16-FP32DENORM: estimated cost of 10 for {{.*}} fdiv half
; FP16: estimated cost of 3 for {{.*}} fdiv half
define amdgpu_kernel void @rcp_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
%vec = load half, half addrspace(1)* %vaddr
%add = fdiv half 1.0, %vec
store half %add, half addrspace(1)* %out
ret void
}
; ALL: 'rcp_f64'
; CIFASTF64: estimated cost of 29 for {{.*}} fdiv double
; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double
; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double
define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
%vec = load double, double addrspace(1)* %vaddr
%add = fdiv double 1.0, %vec
store double %add, double addrspace(1)* %out
ret void
}
; ALL: 'rcp_v2f32'
; NOFP32DENORM: estimated cost of 6 for {{.*}} fdiv <2 x float>
; SLOWFP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float>
; FASTFP32DENORMS: estimated cost of 20 for {{.*}} fdiv <2 x float>
define amdgpu_kernel void @rcp_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
%add = fdiv <2 x float> <float 1.0, float 1.0>, %vec
store <2 x float> %add, <2 x float> addrspace(1)* %out
ret void
}
; ALL: 'rcp_v2f16'
; NOFP16-NOFP32DENORM: estimated cost of 6 for {{.*}} fdiv <2 x half>
; NOFP16-FP32DENORM: estimated cost of 20 for {{.*}} fdiv <2 x half>
; FP16: estimated cost of 6 for {{.*}} fdiv <2 x half>
define amdgpu_kernel void @rcp_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
%add = fdiv <2 x half> <half 1.0, half 1.0>, %vec
store <2 x half> %add, <2 x half> addrspace(1)* %out
ret void
}
attributes #0 = { nounwind }