On processors supporting vector registers and SIMD instructions, enable i128 as a legal type in VRs. This allows many operations to be implemented via native instructions directly in VRs (including add, subtract, logical operations, and shifts). For a few other operations (e.g. multiply and divide, as well as atomic operations), we need to move the i128 value back to a GPR pair to use the corresponding instruction there. Overall, this is still beneficial.

The patch includes the following LLVM changes:
- Enable i128 as legal type
- Set up legal operations (in SystemZInstrVector.td)
- Custom expansion for i128 add/subtract with carry
- Custom expansion for i128 comparisons and selects
- Support for moving i128 to/from GPR pairs when required
- Handle 128-bit integer constant values everywhere
- Use i128 as intrinsic operand type where appropriate
- Updated and new test cases

In addition, clang builtins are updated to reflect the intrinsic operand type changes (which also improves compatibility with GCC).
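For illustration, here is a minimal sketch of the intended effect on user code (hypothetical functions, not taken from the patch; the instruction choices are expectations based on the description above):

/* Compiled with -march=z13 or later: the addition can now be selected
   directly in a vector register (quadword vector add, VAQ), while the
   multiplication has no quadword vector instruction and is expected to
   move the value back to a GPR pair, as described above. */
unsigned __int128 add128(unsigned __int128 a, unsigned __int128 b) {
  return a + b;   /* native i128 add in a VR */
}
unsigned __int128 mul128(unsigned __int128 a, unsigned __int128 b) {
  return a * b;   /* expanded via a GPR-pair multiply sequence */
}

The builtin side of the change is visible in the test file below: __builtin_s390_vmslg now takes and returns a plain unsigned __int128 accumulator instead of a vector type, matching the i128 intrinsic operand type.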
// REQUIRES: systemz-registered-target
// RUN: %clang_cc1 -target-cpu z14 -triple s390x-ibm-linux -flax-vector-conversions=none \
// RUN: -Wall -Wno-unused -Werror -emit-llvm %s -o - | FileCheck %s

typedef __attribute__((vector_size(16))) signed char vec_schar;
typedef __attribute__((vector_size(16))) signed short vec_sshort;
typedef __attribute__((vector_size(16))) signed int vec_sint;
typedef __attribute__((vector_size(16))) signed long long vec_slong;
typedef __attribute__((vector_size(16))) unsigned char vec_uchar;
typedef __attribute__((vector_size(16))) unsigned short vec_ushort;
typedef __attribute__((vector_size(16))) unsigned int vec_uint;
typedef __attribute__((vector_size(16))) unsigned long long vec_ulong;
typedef __attribute__((vector_size(16))) double vec_double;
typedef __attribute__((vector_size(16))) float vec_float;

volatile vec_schar vsc;
volatile vec_sshort vss;
volatile vec_sint vsi;
volatile vec_slong vsl;
volatile vec_uchar vuc;
volatile vec_ushort vus;
volatile vec_uint vui;
volatile vec_ulong vul;
volatile vec_double vd;
volatile vec_float vf;
volatile unsigned __int128 ui128;

volatile unsigned int len;
const void * volatile cptr;
void * volatile ptr;
int cc;

void test_core(void) {
  vul = __builtin_s390_vbperm(vuc, vuc);
  // CHECK: call <2 x i64> @llvm.s390.vbperm(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})

  vsc = __builtin_s390_vlrlr(len, cptr);
  // CHECK: call <16 x i8> @llvm.s390.vlrl(i32 %{{.*}}, ptr %{{.*}})

  __builtin_s390_vstrlr(vsc, len, ptr);
  // CHECK: call void @llvm.s390.vstrl(<16 x i8> %{{.*}}, i32 %{{.*}}, ptr %{{.*}})
}

void test_integer(void) {
  ui128 = __builtin_s390_vmslg(vul, vul, ui128, 0);
  // CHECK: call i128 @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}, i32 0)
  ui128 = __builtin_s390_vmslg(vul, vul, ui128, 15);
  // CHECK: call i128 @llvm.s390.vmslg(<2 x i64> %{{.*}}, <2 x i64> %{{.*}}, i128 %{{.*}}, i32 15)
}

void test_float(void) {
  vd = __builtin_s390_vfmaxdb(vd, vd, 4);
  // CHECK: call <2 x double> @llvm.maxnum.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}})
  vd = __builtin_s390_vfmaxdb(vd, vd, 0);
  // CHECK: call <2 x double> @llvm.s390.vfmaxdb(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 0)
  vd = __builtin_s390_vfmaxdb(vd, vd, 15);
  // CHECK: call <2 x double> @llvm.s390.vfmaxdb(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 15)

  vd = __builtin_s390_vfmindb(vd, vd, 4);
  // CHECK: call <2 x double> @llvm.minnum.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}})
  vd = __builtin_s390_vfmindb(vd, vd, 0);
  // CHECK: call <2 x double> @llvm.s390.vfmindb(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 0)
  vd = __builtin_s390_vfmindb(vd, vd, 15);
  // CHECK: call <2 x double> @llvm.s390.vfmindb(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 15)

  vd = __builtin_s390_vfnmadb(vd, vd, vd);
  // CHECK: [[RES:%[^ ]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}})
  // CHECK: fneg <2 x double> [[RES]]
  vd = __builtin_s390_vfnmsdb(vd, vd, vd);
  // CHECK: [[NEG:%[^ ]+]] = fneg <2 x double> %{{.*}}
  // CHECK: [[RES:%[^ ]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> [[NEG]])
  // CHECK: fneg <2 x double> [[RES]]

  vsi = __builtin_s390_vfcesbs(vf, vf, &cc);
  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfcesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  vsi = __builtin_s390_vfchsbs(vf, vf, &cc);
  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchsbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  vsi = __builtin_s390_vfchesbs(vf, vf, &cc);
  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vfchesbs(<4 x float> %{{.*}}, <4 x float> %{{.*}})

  vsi = __builtin_s390_vftcisb(vf, 0, &cc);
  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vftcisb(<4 x float> %{{.*}}, i32 0)
  vsi = __builtin_s390_vftcisb(vf, 4095, &cc);
  // CHECK: call { <4 x i32>, i32 } @llvm.s390.vftcisb(<4 x float> %{{.*}}, i32 4095)

  vf = __builtin_s390_vfmaxsb(vf, vf, 4);
  // CHECK: call <4 x float> @llvm.maxnum.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  vf = __builtin_s390_vfmaxsb(vf, vf, 0);
  // CHECK: call <4 x float> @llvm.s390.vfmaxsb(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 0)
  vf = __builtin_s390_vfmaxsb(vf, vf, 15);
  // CHECK: call <4 x float> @llvm.s390.vfmaxsb(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 15)

  vf = __builtin_s390_vfminsb(vf, vf, 4);
  // CHECK: call <4 x float> @llvm.minnum.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}})
  vf = __builtin_s390_vfminsb(vf, vf, 0);
  // CHECK: call <4 x float> @llvm.s390.vfminsb(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 0)
  vf = __builtin_s390_vfminsb(vf, vf, 15);
  // CHECK: call <4 x float> @llvm.s390.vfminsb(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 15)

  vf = __builtin_s390_vfsqsb(vf);
  // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.*}})

  vf = __builtin_s390_vfmasb(vf, vf, vf);
  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
  vf = __builtin_s390_vfmssb(vf, vf, vf);
  // CHECK: [[NEG:%[^ ]+]] = fneg <4 x float> %{{.*}}
  // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]])
  vf = __builtin_s390_vfnmasb(vf, vf, vf);
  // CHECK: [[RES:%[^ ]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}})
  // CHECK: fneg <4 x float> [[RES]]
  vf = __builtin_s390_vfnmssb(vf, vf, vf);
  // CHECK: [[NEG:%[^ ]+]] = fneg <4 x float> %{{.*}}
  // CHECK: [[RES:%[^ ]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> [[NEG]])
  // CHECK: fneg <4 x float> [[RES]]

  vf = __builtin_s390_vflpsb(vf);
  // CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{.*}})
  vf = __builtin_s390_vflnsb(vf);
  // CHECK: [[ABS:%[^ ]+]] = call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{.*}})
  // CHECK: fneg <4 x float> [[ABS]]

  vf = __builtin_s390_vfisb(vf, 0, 0);
  // CHECK: call <4 x float> @llvm.rint.v4f32(<4 x float> %{{.*}})
  vf = __builtin_s390_vfisb(vf, 4, 0);
  // CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{.*}})
  vf = __builtin_s390_vfisb(vf, 4, 1);
  // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{.*}})
  vf = __builtin_s390_vfisb(vf, 4, 5);
  // CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{.*}})
  vf = __builtin_s390_vfisb(vf, 4, 6);
  // CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{.*}})
  vf = __builtin_s390_vfisb(vf, 4, 7);
  // CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{.*}})
  vf = __builtin_s390_vfisb(vf, 4, 4);
  // CHECK: call <4 x float> @llvm.s390.vfisb(<4 x float> %{{.*}}, i32 4, i32 4)
}