[NVPTX] use untyped loads and stores wherever possible (#137698)
In most cases, the type information attached to load and store instructions is meaningless and inconsistently applied. We can usually use ".b" loads and avoid the complexity of trying to assign the correct type. The one exception is sign-extending loads, which will continue to use ".s" to ensure the sign extension into a larger register is done correctly.
This commit is contained in:
@@ -11,7 +11,7 @@
|
||||
// CHECK: .param .align 2 .b8 _Z8test_argPDF16bDF16b_param_1[2]
|
||||
//
|
||||
__device__ void test_arg(__bf16 *out, __bf16 in) {
|
||||
// CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [_Z8test_argPDF16bDF16b_param_0];
|
||||
// CHECK-DAG: ld.param.b64 %[[A:rd[0-9]+]], [_Z8test_argPDF16bDF16b_param_0];
|
||||
// CHECK-DAG: ld.param.b16 %[[R:rs[0-9]+]], [_Z8test_argPDF16bDF16b_param_1];
|
||||
__bf16 bf16 = in;
|
||||
*out = bf16;
|
||||
|
||||
@@ -179,26 +179,26 @@
|
||||
__host__ __device__ float func(float a, float b, float c) { return a + b * c; }
|
||||
// COMMON-LABEL: _Z4funcfff
|
||||
// NV-ON: fma.rn.f32
|
||||
// NV-ON-NEXT: st.param.f32
|
||||
// NV-ON-NEXT: st.param.b32
|
||||
// AMD-ON: v_fmac_f32_e64
|
||||
// AMD-ON-NEXT: s_setpc_b64
|
||||
|
||||
// NV-OFF: mul.rn.f32
|
||||
// NV-OFF-NEXT: add.rn.f32
|
||||
// NV-OFF-NEXT: st.param.f32
|
||||
// NV-OFF-NEXT: st.param.b32
|
||||
// AMD-OFF: v_mul_f32_e64
|
||||
// AMD-OFF-NEXT: v_add_f32_e64
|
||||
// AMD-OFF-NEXT: s_setpc_b64
|
||||
|
||||
// NV-OPT-FAST: fma.rn.f32
|
||||
// NV-OPT-FAST-NEXT: st.param.f32
|
||||
// NV-OPT-FAST-NEXT: st.param.b32
|
||||
// NV-OPT-FASTSTD: fma.rn.f32
|
||||
// NV-OPT-FASTSTD-NEXT: st.param.f32
|
||||
// NV-OPT-FASTSTD-NEXT: st.param.b32
|
||||
// NV-OPT-ON: fma.rn.f32
|
||||
// NV-OPT-ON-NEXT: st.param.f32
|
||||
// NV-OPT-ON-NEXT: st.param.b32
|
||||
// NV-OPT-OFF: mul.rn.f32
|
||||
// NV-OPT-OFF-NEXT: add.rn.f32
|
||||
// NV-OPT-OFF-NEXT: st.param.f32
|
||||
// NV-OPT-OFF-NEXT: st.param.b32
|
||||
|
||||
// AMD-OPT-FAST-IR: fmul contract float
|
||||
// AMD-OPT-FAST-IR: fadd contract float
|
||||
@@ -224,15 +224,15 @@ __host__ __device__ float func2(float a, float b, float c) {
|
||||
}
|
||||
// COMMON-LABEL: _Z5func2fff
|
||||
// NV-OPT-FAST: fma.rn.f32
|
||||
// NV-OPT-FAST-NEXT: st.param.f32
|
||||
// NV-OPT-FAST-NEXT: st.param.b32
|
||||
// NV-OPT-FASTSTD: fma.rn.f32
|
||||
// NV-OPT-FASTSTD-NEXT: st.param.f32
|
||||
// NV-OPT-FASTSTD-NEXT: st.param.b32
|
||||
// NV-OPT-ON: mul.rn.f32
|
||||
// NV-OPT-ON: add.rn.f32
|
||||
// NV-OPT-ON-NEXT: st.param.f32
|
||||
// NV-OPT-ON-NEXT: st.param.b32
|
||||
// NV-OPT-OFF: mul.rn.f32
|
||||
// NV-OPT-OFF: add.rn.f32
|
||||
// NV-OPT-OFF-NEXT: st.param.f32
|
||||
// NV-OPT-OFF-NEXT: st.param.b32
|
||||
|
||||
// AMD-OPT-FAST-IR: fmul contract float
|
||||
// AMD-OPT-FAST-IR: fadd contract float
|
||||
@@ -267,16 +267,16 @@ __host__ __device__ float func2(float a, float b, float c) {
|
||||
}
|
||||
// COMMON-LABEL: _Z5func3fff
|
||||
// NV-OPT-FAST: fma.rn.f32
|
||||
// NV-OPT-FAST-NEXT: st.param.f32
|
||||
// NV-OPT-FAST-NEXT: st.param.b32
|
||||
// NV-OPT-FASTSTD: mul.rn.f32
|
||||
// NV-OPT-FASTSTD: add.rn.f32
|
||||
// NV-OPT-FASTSTD-NEXT: st.param.f32
|
||||
// NV-OPT-FASTSTD-NEXT: st.param.b32
|
||||
// NV-OPT-ON: mul.rn.f32
|
||||
// NV-OPT-ON: add.rn.f32
|
||||
// NV-OPT-ON-NEXT: st.param.f32
|
||||
// NV-OPT-ON-NEXT: st.param.b32
|
||||
// NV-OPT-OFF: mul.rn.f32
|
||||
// NV-OPT-OFF: add.rn.f32
|
||||
// NV-OPT-OFF-NEXT: st.param.f32
|
||||
// NV-OPT-OFF-NEXT: st.param.b32
|
||||
|
||||
// AMD-OPT-FAST-IR: fmul float
|
||||
// AMD-OPT-FAST-IR: fadd float
|
||||
|
||||
@@ -10,15 +10,15 @@
|
||||
// PTX-LABEL: .func _Z12copy_genericPvPKv(
|
||||
void __device__ copy_generic(void *dest, const void *src) {
|
||||
__builtin_memcpy(dest, src, 32);
|
||||
// PTX: ld.u8
|
||||
// PTX: st.u8
|
||||
// PTX: ld.b8
|
||||
// PTX: st.b8
|
||||
}
|
||||
|
||||
// PTX-LABEL: .entry _Z11copy_globalPvS_(
|
||||
void __global__ copy_global(void *dest, void * src) {
|
||||
__builtin_memcpy(dest, src, 32);
|
||||
// PTX: ld.global.u8
|
||||
// PTX: st.global.u8
|
||||
// PTX: ld.global.b8
|
||||
// PTX: st.global.b8
|
||||
}
|
||||
|
||||
struct S {
|
||||
@@ -28,24 +28,24 @@ struct S {
|
||||
// PTX-LABEL: .entry _Z20copy_param_to_globalP1SS_(
|
||||
void __global__ copy_param_to_global(S *global, S param) {
|
||||
__builtin_memcpy(global, &param, sizeof(S));
|
||||
// PTX: ld.param.u32
|
||||
// PTX: st.global.u32
|
||||
// PTX: ld.param.b32
|
||||
// PTX: st.global.b32
|
||||
}
|
||||
|
||||
// PTX-LABEL: .entry _Z19copy_param_to_localPU3AS51SS_(
|
||||
void __global__ copy_param_to_local(__attribute__((address_space(5))) S *local,
|
||||
S param) {
|
||||
__builtin_memcpy(local, &param, sizeof(S));
|
||||
// PTX: ld.param.u32
|
||||
// PTX: st.local.u32
|
||||
// PTX: ld.param.b32
|
||||
// PTX: st.local.b32
|
||||
}
|
||||
|
||||
// PTX-LABEL: .func _Z21copy_local_to_genericP1SPU3AS5S_(
|
||||
void __device__ copy_local_to_generic(S *generic,
|
||||
__attribute__((address_space(5))) S *src) {
|
||||
__builtin_memcpy(generic, src, sizeof(S));
|
||||
// PTX: ld.local.u32
|
||||
// PTX: st.u32
|
||||
// PTX: ld.local.b32
|
||||
// PTX: st.b32
|
||||
}
|
||||
|
||||
__shared__ S shared;
|
||||
@@ -53,12 +53,12 @@ __shared__ S shared;
|
||||
// PTX-LABEL: .entry _Z20copy_param_to_shared1S(
|
||||
void __global__ copy_param_to_shared( S param) {
|
||||
__builtin_memcpy(&shared, &param, sizeof(S));
|
||||
// PTX: ld.param.u32
|
||||
// PTX: st.shared.u32
|
||||
// PTX: ld.param.b32
|
||||
// PTX: st.shared.b32
|
||||
}
|
||||
|
||||
void __device__ copy_shared_to_generic(S *generic) {
|
||||
__builtin_memcpy(generic, &shared, sizeof(S));
|
||||
// PTX: ld.shared.u32
|
||||
// PTX: st.u32
|
||||
// PTX: ld.shared.b32
|
||||
// PTX: st.b32
|
||||
}
|
||||
|
||||
@@ -1044,21 +1044,6 @@ pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8,
|
||||
}
|
||||
}
|
||||
|
||||
static int getLdStRegType(EVT VT) {
|
||||
if (VT.isFloatingPoint())
|
||||
switch (VT.getSimpleVT().SimpleTy) {
|
||||
case MVT::f16:
|
||||
case MVT::bf16:
|
||||
case MVT::v2f16:
|
||||
case MVT::v2bf16:
|
||||
return NVPTX::PTXLdStInstCode::Untyped;
|
||||
default:
|
||||
return NVPTX::PTXLdStInstCode::Float;
|
||||
}
|
||||
else
|
||||
return NVPTX::PTXLdStInstCode::Unsigned;
|
||||
}
|
||||
|
||||
bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
|
||||
MemSDNode *LD = cast<MemSDNode>(N);
|
||||
assert(LD->readMem() && "Expected load");
|
||||
@@ -1088,24 +1073,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
|
||||
// type is integer
|
||||
// Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
|
||||
MVT SimpleVT = LoadedVT.getSimpleVT();
|
||||
MVT ScalarVT = SimpleVT.getScalarType();
|
||||
// Read at least 8 bits (predicates are stored as 8-bit values)
|
||||
unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
|
||||
unsigned int FromType;
|
||||
unsigned FromTypeWidth = std::max(8U, (unsigned)SimpleVT.getSizeInBits());
|
||||
|
||||
// Vector Setting
|
||||
unsigned VecType = NVPTX::PTXLdStInstCode::Scalar;
|
||||
if (SimpleVT.isVector()) {
|
||||
assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) &&
|
||||
"Unexpected vector type");
|
||||
// v2f16/v2bf16/v2i16 is loaded using ld.b32
|
||||
FromTypeWidth = 32;
|
||||
}
|
||||
|
||||
if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
|
||||
FromType = NVPTX::PTXLdStInstCode::Signed;
|
||||
else
|
||||
FromType = getLdStRegType(ScalarVT);
|
||||
unsigned int FromType =
|
||||
(PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
|
||||
? NVPTX::PTXLdStInstCode::Signed
|
||||
: NVPTX::PTXLdStInstCode::Untyped;
|
||||
|
||||
assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
|
||||
FromTypeWidth <= 128 && "Invalid width for load");
|
||||
@@ -1116,7 +1091,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
|
||||
SDValue Ops[] = {getI32Imm(Ordering, DL),
|
||||
getI32Imm(Scope, DL),
|
||||
getI32Imm(CodeAddrSpace, DL),
|
||||
getI32Imm(VecType, DL),
|
||||
getI32Imm(NVPTX::PTXLdStInstCode::Scalar, DL),
|
||||
getI32Imm(FromType, DL),
|
||||
getI32Imm(FromTypeWidth, DL),
|
||||
Base,
|
||||
@@ -1182,7 +1157,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
unsigned ExtensionType = N->getConstantOperandVal(N->getNumOperands() - 1);
|
||||
unsigned FromType = (ExtensionType == ISD::SEXTLOAD)
|
||||
? NVPTX::PTXLdStInstCode::Signed
|
||||
: getLdStRegType(MemVT.getScalarType());
|
||||
: NVPTX::PTXLdStInstCode::Untyped;
|
||||
|
||||
unsigned VecType;
|
||||
unsigned FromTypeWidth;
|
||||
@@ -1200,8 +1175,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
}
|
||||
|
||||
if (isSubVectorPackedInI32(EltVT)) {
|
||||
assert(ExtensionType == ISD::NON_EXTLOAD);
|
||||
EltVT = MVT::i32;
|
||||
FromType = NVPTX::PTXLdStInstCode::Untyped;
|
||||
}
|
||||
|
||||
assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 &&
|
||||
@@ -1405,21 +1380,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
|
||||
auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST);
|
||||
|
||||
// Vector Setting
|
||||
MVT SimpleVT = StoreVT.getSimpleVT();
|
||||
unsigned VecType = NVPTX::PTXLdStInstCode::Scalar;
|
||||
|
||||
// Type Setting: toType + toTypeWidth
|
||||
// - for integer type, always use 'u'
|
||||
MVT ScalarVT = SimpleVT.getScalarType();
|
||||
unsigned ToTypeWidth = ScalarVT.getSizeInBits();
|
||||
if (SimpleVT.isVector()) {
|
||||
assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) &&
|
||||
"Unexpected vector type");
|
||||
// v2x16 is stored using st.b32
|
||||
ToTypeWidth = 32;
|
||||
}
|
||||
|
||||
unsigned int ToType = getLdStRegType(ScalarVT);
|
||||
const unsigned ToTypeWidth = StoreVT.getSimpleVT().getSizeInBits();
|
||||
|
||||
// Create the machine instruction DAG
|
||||
SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
|
||||
@@ -1434,8 +1395,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
|
||||
getI32Imm(Ordering, DL),
|
||||
getI32Imm(Scope, DL),
|
||||
getI32Imm(CodeAddrSpace, DL),
|
||||
getI32Imm(VecType, DL),
|
||||
getI32Imm(ToType, DL),
|
||||
getI32Imm(NVPTX::PTXLdStInstCode::Scalar, DL),
|
||||
getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL),
|
||||
getI32Imm(ToTypeWidth, DL),
|
||||
Base,
|
||||
Offset,
|
||||
@@ -1481,7 +1442,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
// Type Setting: toType + toTypeWidth
|
||||
// - for integer type, always use 'u'
|
||||
const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits();
|
||||
unsigned ToType = getLdStRegType(StoreVT.getSimpleVT().getScalarType());
|
||||
|
||||
SmallVector<SDValue, 12> Ops;
|
||||
SDValue N2;
|
||||
@@ -1508,7 +1468,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
|
||||
if (isSubVectorPackedInI32(EltVT)) {
|
||||
EltVT = MVT::i32;
|
||||
ToType = NVPTX::PTXLdStInstCode::Untyped;
|
||||
}
|
||||
|
||||
assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 &&
|
||||
@@ -1519,8 +1478,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
|
||||
Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL),
|
||||
getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL),
|
||||
getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL), Base, Offset,
|
||||
Chain});
|
||||
getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL),
|
||||
getI32Imm(ToTypeWidth, DL), Base, Offset, Chain});
|
||||
|
||||
std::optional<unsigned> Opcode;
|
||||
switch (N->getOpcode()) {
|
||||
|
||||
@@ -2249,11 +2249,11 @@ def LoadParamMemV2I8 : LoadParamV2MemInst<Int16Regs, ".b8">;
|
||||
def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
|
||||
def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
|
||||
def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">;
|
||||
def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">;
|
||||
def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">;
|
||||
def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
|
||||
def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
|
||||
def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
|
||||
def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".b32">;
|
||||
def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".b64">;
|
||||
def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".b32">;
|
||||
def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".b64">;
|
||||
def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".b32">;
|
||||
|
||||
defm StoreParamI64 : StoreParamInst<Int64Regs, i64imm, ".b64">;
|
||||
defm StoreParamI32 : StoreParamInst<Int32Regs, i32imm, ".b32">;
|
||||
@@ -2272,13 +2272,13 @@ defm StoreParamV4I32 : StoreParamV4Inst<Int32Regs, i32imm, ".b32">;
|
||||
defm StoreParamV4I16 : StoreParamV4Inst<Int16Regs, i16imm, ".b16">;
|
||||
defm StoreParamV4I8 : StoreParamV4Inst<Int16Regs, i8imm, ".b8">;
|
||||
|
||||
defm StoreParamF32 : StoreParamInst<Float32Regs, f32imm, ".f32">;
|
||||
defm StoreParamF64 : StoreParamInst<Float64Regs, f64imm, ".f64">;
|
||||
defm StoreParamF32 : StoreParamInst<Float32Regs, f32imm, ".b32">;
|
||||
defm StoreParamF64 : StoreParamInst<Float64Regs, f64imm, ".b64">;
|
||||
|
||||
defm StoreParamV2F32 : StoreParamV2Inst<Float32Regs, f32imm, ".f32">;
|
||||
defm StoreParamV2F64 : StoreParamV2Inst<Float64Regs, f64imm, ".f64">;
|
||||
defm StoreParamV2F32 : StoreParamV2Inst<Float32Regs, f32imm, ".b32">;
|
||||
defm StoreParamV2F64 : StoreParamV2Inst<Float64Regs, f64imm, ".b64">;
|
||||
|
||||
defm StoreParamV4F32 : StoreParamV4Inst<Float32Regs, f32imm, ".f32">;
|
||||
defm StoreParamV4F32 : StoreParamV4Inst<Float32Regs, f32imm, ".b32">;
|
||||
|
||||
def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
|
||||
def StoreRetvalI32 : StoreRetvalInst<Int32Regs, ".b32">;
|
||||
@@ -2294,11 +2294,11 @@ def StoreRetvalV4I32 : StoreRetvalV4Inst<Int32Regs, ".b32">;
|
||||
def StoreRetvalV4I16 : StoreRetvalV4Inst<Int16Regs, ".b16">;
|
||||
def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">;
|
||||
|
||||
def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">;
|
||||
def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">;
|
||||
def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">;
|
||||
def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">;
|
||||
def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">;
|
||||
def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".b64">;
|
||||
def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".b32">;
|
||||
def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".b64">;
|
||||
def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".b32">;
|
||||
def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".b32">;
|
||||
|
||||
def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
|
||||
def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
|
||||
|
||||
@@ -2329,12 +2329,12 @@ class LDU_G<string TyStr, NVPTXRegClass regclass>
|
||||
"ldu.global." # TyStr # " \t$result, [$src];",
|
||||
[]>, Requires<[hasLDU]>;
|
||||
|
||||
def INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8", Int16Regs>;
|
||||
def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16", Int16Regs>;
|
||||
def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32", Int32Regs>;
|
||||
def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64", Int64Regs>;
|
||||
def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32", Float32Regs>;
|
||||
def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64", Float64Regs>;
|
||||
def INT_PTX_LDU_GLOBAL_i8 : LDU_G<"b8", Int16Regs>;
|
||||
def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>;
|
||||
def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>;
|
||||
def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>;
|
||||
def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"b32", Float32Regs>;
|
||||
def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"b64", Float64Regs>;
|
||||
|
||||
// vector
|
||||
|
||||
@@ -2351,19 +2351,19 @@ class VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass>
|
||||
"ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
|
||||
|
||||
|
||||
def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"u8", Int16Regs>;
|
||||
def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"u16", Int16Regs>;
|
||||
def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"u32", Int32Regs>;
|
||||
def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"f32", Float32Regs>;
|
||||
def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"u64", Int64Regs>;
|
||||
def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"f64", Float64Regs>;
|
||||
def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"b8", Int16Regs>;
|
||||
def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"b16", Int16Regs>;
|
||||
def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"b32", Int32Regs>;
|
||||
def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"b32", Float32Regs>;
|
||||
def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"b64", Int64Regs>;
|
||||
def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"b64", Float64Regs>;
|
||||
|
||||
def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"u8", Int16Regs>;
|
||||
def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"u16", Int16Regs>;
|
||||
def INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"u32", Int32Regs>;
|
||||
def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"b8", Int16Regs>;
|
||||
def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
|
||||
def INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
|
||||
def INT_PTX_LDU_G_v4f16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
|
||||
def INT_PTX_LDU_G_v4f16x2_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
|
||||
def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"f32", Float32Regs>;
|
||||
def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"b32", Float32Regs>;
|
||||
|
||||
|
||||
//-----------------------------------
|
||||
@@ -2379,12 +2379,12 @@ class LDG_G<string TyStr, NVPTXRegClass regclass>
|
||||
"ld.global.nc." # TyStr # " \t$result, [$src];",
|
||||
[]>, Requires<[hasLDG]>;
|
||||
|
||||
def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"u8", Int16Regs>;
|
||||
def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"u16", Int16Regs>;
|
||||
def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"u32", Int32Regs>;
|
||||
def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"u64", Int64Regs>;
|
||||
def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"f32", Float32Regs>;
|
||||
def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"f64", Float64Regs>;
|
||||
def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"b8", Int16Regs>;
|
||||
def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"b16", Int16Regs>;
|
||||
def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"b32", Int32Regs>;
|
||||
def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"b64", Int64Regs>;
|
||||
def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"b32", Float32Regs>;
|
||||
def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"b64", Float64Regs>;
|
||||
|
||||
// vector
|
||||
|
||||
@@ -2401,17 +2401,17 @@ class VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> :
|
||||
"ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
|
||||
|
||||
// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
|
||||
def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"u8", Int16Regs>;
|
||||
def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"u16", Int16Regs>;
|
||||
def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"u32", Int32Regs>;
|
||||
def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"f32", Float32Regs>;
|
||||
def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"u64", Int64Regs>;
|
||||
def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"f64", Float64Regs>;
|
||||
def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"b8", Int16Regs>;
|
||||
def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"b16", Int16Regs>;
|
||||
def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"b32", Int32Regs>;
|
||||
def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"b32", Float32Regs>;
|
||||
def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"b64", Int64Regs>;
|
||||
def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"b64", Float64Regs>;
|
||||
|
||||
def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"u8", Int16Regs>;
|
||||
def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"u16", Int16Regs>;
|
||||
def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"u32", Int32Regs>;
|
||||
def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"f32", Float32Regs>;
|
||||
def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"b8", Int16Regs>;
|
||||
def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"b16", Int16Regs>;
|
||||
def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"b32", Int32Regs>;
|
||||
def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"b32", Float32Regs>;
|
||||
|
||||
|
||||
multiclass NG_TO_G<string Str, bit Supports32 = 1, list<Predicate> Preds = []> {
|
||||
|
||||
@@ -16,8 +16,8 @@ define i32 @f(ptr %p) {
|
||||
; ENABLED-NEXT: .reg .b64 %rd<2>;
|
||||
; ENABLED-EMPTY:
|
||||
; ENABLED-NEXT: // %bb.0:
|
||||
; ENABLED-NEXT: ld.param.u64 %rd1, [f_param_0];
|
||||
; ENABLED-NEXT: ld.v2.u32 {%r1, %r2}, [%rd1];
|
||||
; ENABLED-NEXT: ld.param.b64 %rd1, [f_param_0];
|
||||
; ENABLED-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
|
||||
; ENABLED-NEXT: add.s32 %r3, %r1, %r2;
|
||||
; ENABLED-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; ENABLED-NEXT: ret;
|
||||
@@ -28,9 +28,9 @@ define i32 @f(ptr %p) {
|
||||
; DISABLED-NEXT: .reg .b64 %rd<2>;
|
||||
; DISABLED-EMPTY:
|
||||
; DISABLED-NEXT: // %bb.0:
|
||||
; DISABLED-NEXT: ld.param.u64 %rd1, [f_param_0];
|
||||
; DISABLED-NEXT: ld.u32 %r1, [%rd1];
|
||||
; DISABLED-NEXT: ld.u32 %r2, [%rd1+4];
|
||||
; DISABLED-NEXT: ld.param.b64 %rd1, [f_param_0];
|
||||
; DISABLED-NEXT: ld.b32 %r1, [%rd1];
|
||||
; DISABLED-NEXT: ld.b32 %r2, [%rd1+4];
|
||||
; DISABLED-NEXT: add.s32 %r3, %r1, %r2;
|
||||
; DISABLED-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; DISABLED-NEXT: ret;
|
||||
@@ -49,7 +49,7 @@ define half @fh(ptr %p) {
|
||||
; ENABLED-NEXT: .reg .b64 %rd<2>;
|
||||
; ENABLED-EMPTY:
|
||||
; ENABLED-NEXT: // %bb.0:
|
||||
; ENABLED-NEXT: ld.param.u64 %rd1, [fh_param_0];
|
||||
; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0];
|
||||
; ENABLED-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
|
||||
; ENABLED-NEXT: ld.b16 %rs5, [%rd1+8];
|
||||
; ENABLED-NEXT: cvt.f32.f16 %f1, %rs2;
|
||||
@@ -78,7 +78,7 @@ define half @fh(ptr %p) {
|
||||
; DISABLED-NEXT: .reg .b64 %rd<2>;
|
||||
; DISABLED-EMPTY:
|
||||
; DISABLED-NEXT: // %bb.0:
|
||||
; DISABLED-NEXT: ld.param.u64 %rd1, [fh_param_0];
|
||||
; DISABLED-NEXT: ld.param.b64 %rd1, [fh_param_0];
|
||||
; DISABLED-NEXT: ld.b16 %rs1, [%rd1];
|
||||
; DISABLED-NEXT: ld.b16 %rs2, [%rd1+2];
|
||||
; DISABLED-NEXT: ld.b16 %rs3, [%rd1+4];
|
||||
@@ -125,14 +125,14 @@ define float @ff(ptr %p) {
|
||||
; ENABLED-NEXT: .reg .b64 %rd<2>;
|
||||
; ENABLED-EMPTY:
|
||||
; ENABLED-NEXT: // %bb.0:
|
||||
; ENABLED-NEXT: ld.param.u64 %rd1, [ff_param_0];
|
||||
; ENABLED-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
|
||||
; ENABLED-NEXT: ld.f32 %f5, [%rd1+16];
|
||||
; ENABLED-NEXT: ld.param.b64 %rd1, [ff_param_0];
|
||||
; ENABLED-NEXT: ld.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1];
|
||||
; ENABLED-NEXT: ld.b32 %f5, [%rd1+16];
|
||||
; ENABLED-NEXT: add.rn.f32 %f6, %f1, %f2;
|
||||
; ENABLED-NEXT: add.rn.f32 %f7, %f3, %f4;
|
||||
; ENABLED-NEXT: add.rn.f32 %f8, %f6, %f7;
|
||||
; ENABLED-NEXT: add.rn.f32 %f9, %f8, %f5;
|
||||
; ENABLED-NEXT: st.param.f32 [func_retval0], %f9;
|
||||
; ENABLED-NEXT: st.param.b32 [func_retval0], %f9;
|
||||
; ENABLED-NEXT: ret;
|
||||
;
|
||||
; DISABLED-LABEL: ff(
|
||||
@@ -141,17 +141,17 @@ define float @ff(ptr %p) {
|
||||
; DISABLED-NEXT: .reg .b64 %rd<2>;
|
||||
; DISABLED-EMPTY:
|
||||
; DISABLED-NEXT: // %bb.0:
|
||||
; DISABLED-NEXT: ld.param.u64 %rd1, [ff_param_0];
|
||||
; DISABLED-NEXT: ld.f32 %f1, [%rd1];
|
||||
; DISABLED-NEXT: ld.f32 %f2, [%rd1+4];
|
||||
; DISABLED-NEXT: ld.f32 %f3, [%rd1+8];
|
||||
; DISABLED-NEXT: ld.f32 %f4, [%rd1+12];
|
||||
; DISABLED-NEXT: ld.f32 %f5, [%rd1+16];
|
||||
; DISABLED-NEXT: ld.param.b64 %rd1, [ff_param_0];
|
||||
; DISABLED-NEXT: ld.b32 %f1, [%rd1];
|
||||
; DISABLED-NEXT: ld.b32 %f2, [%rd1+4];
|
||||
; DISABLED-NEXT: ld.b32 %f3, [%rd1+8];
|
||||
; DISABLED-NEXT: ld.b32 %f4, [%rd1+12];
|
||||
; DISABLED-NEXT: ld.b32 %f5, [%rd1+16];
|
||||
; DISABLED-NEXT: add.rn.f32 %f6, %f1, %f2;
|
||||
; DISABLED-NEXT: add.rn.f32 %f7, %f3, %f4;
|
||||
; DISABLED-NEXT: add.rn.f32 %f8, %f6, %f7;
|
||||
; DISABLED-NEXT: add.rn.f32 %f9, %f8, %f5;
|
||||
; DISABLED-NEXT: st.param.f32 [func_retval0], %f9;
|
||||
; DISABLED-NEXT: st.param.b32 [func_retval0], %f9;
|
||||
; DISABLED-NEXT: ret;
|
||||
%p.1 = getelementptr float, ptr %p, i32 1
|
||||
%p.2 = getelementptr float, ptr %p, i32 2
|
||||
@@ -176,9 +176,9 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
|
||||
; ENABLED-NEXT: .reg .b64 %rd<3>;
|
||||
; ENABLED-EMPTY:
|
||||
; ENABLED-NEXT: // %bb.0:
|
||||
; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_param_0];
|
||||
; ENABLED-NEXT: ld.param.b64 %rd1, [combine_v16i8_param_0];
|
||||
; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_param_1];
|
||||
; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_param_1];
|
||||
; ENABLED-NEXT: bfe.u32 %r5, %r1, 0, 8;
|
||||
; ENABLED-NEXT: bfe.u32 %r6, %r1, 8, 8;
|
||||
; ENABLED-NEXT: bfe.u32 %r7, %r1, 16, 8;
|
||||
@@ -210,7 +210,7 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
|
||||
; ENABLED-NEXT: add.s32 %r33, %r32, %r18;
|
||||
; ENABLED-NEXT: add.s32 %r34, %r33, %r19;
|
||||
; ENABLED-NEXT: add.s32 %r35, %r34, %r20;
|
||||
; ENABLED-NEXT: st.u32 [%rd2], %r35;
|
||||
; ENABLED-NEXT: st.b32 [%rd2], %r35;
|
||||
; ENABLED-NEXT: ret;
|
||||
;
|
||||
; DISABLED-LABEL: combine_v16i8(
|
||||
@@ -219,24 +219,24 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
|
||||
; DISABLED-NEXT: .reg .b64 %rd<3>;
|
||||
; DISABLED-EMPTY:
|
||||
; DISABLED-NEXT: // %bb.0:
|
||||
; DISABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_param_0];
|
||||
; DISABLED-NEXT: ld.u8 %r1, [%rd1];
|
||||
; DISABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_param_1];
|
||||
; DISABLED-NEXT: ld.u8 %r2, [%rd1+1];
|
||||
; DISABLED-NEXT: ld.u8 %r3, [%rd1+2];
|
||||
; DISABLED-NEXT: ld.u8 %r4, [%rd1+3];
|
||||
; DISABLED-NEXT: ld.u8 %r5, [%rd1+4];
|
||||
; DISABLED-NEXT: ld.u8 %r6, [%rd1+5];
|
||||
; DISABLED-NEXT: ld.u8 %r7, [%rd1+6];
|
||||
; DISABLED-NEXT: ld.u8 %r8, [%rd1+7];
|
||||
; DISABLED-NEXT: ld.u8 %r9, [%rd1+8];
|
||||
; DISABLED-NEXT: ld.u8 %r10, [%rd1+9];
|
||||
; DISABLED-NEXT: ld.u8 %r11, [%rd1+10];
|
||||
; DISABLED-NEXT: ld.u8 %r12, [%rd1+11];
|
||||
; DISABLED-NEXT: ld.u8 %r13, [%rd1+12];
|
||||
; DISABLED-NEXT: ld.u8 %r14, [%rd1+13];
|
||||
; DISABLED-NEXT: ld.u8 %r15, [%rd1+14];
|
||||
; DISABLED-NEXT: ld.u8 %r16, [%rd1+15];
|
||||
; DISABLED-NEXT: ld.param.b64 %rd1, [combine_v16i8_param_0];
|
||||
; DISABLED-NEXT: ld.b8 %r1, [%rd1];
|
||||
; DISABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_param_1];
|
||||
; DISABLED-NEXT: ld.b8 %r2, [%rd1+1];
|
||||
; DISABLED-NEXT: ld.b8 %r3, [%rd1+2];
|
||||
; DISABLED-NEXT: ld.b8 %r4, [%rd1+3];
|
||||
; DISABLED-NEXT: ld.b8 %r5, [%rd1+4];
|
||||
; DISABLED-NEXT: ld.b8 %r6, [%rd1+5];
|
||||
; DISABLED-NEXT: ld.b8 %r7, [%rd1+6];
|
||||
; DISABLED-NEXT: ld.b8 %r8, [%rd1+7];
|
||||
; DISABLED-NEXT: ld.b8 %r9, [%rd1+8];
|
||||
; DISABLED-NEXT: ld.b8 %r10, [%rd1+9];
|
||||
; DISABLED-NEXT: ld.b8 %r11, [%rd1+10];
|
||||
; DISABLED-NEXT: ld.b8 %r12, [%rd1+11];
|
||||
; DISABLED-NEXT: ld.b8 %r13, [%rd1+12];
|
||||
; DISABLED-NEXT: ld.b8 %r14, [%rd1+13];
|
||||
; DISABLED-NEXT: ld.b8 %r15, [%rd1+14];
|
||||
; DISABLED-NEXT: ld.b8 %r16, [%rd1+15];
|
||||
; DISABLED-NEXT: add.s32 %r17, %r1, %r2;
|
||||
; DISABLED-NEXT: add.s32 %r18, %r17, %r3;
|
||||
; DISABLED-NEXT: add.s32 %r19, %r18, %r4;
|
||||
@@ -252,7 +252,7 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
|
||||
; DISABLED-NEXT: add.s32 %r29, %r28, %r14;
|
||||
; DISABLED-NEXT: add.s32 %r30, %r29, %r15;
|
||||
; DISABLED-NEXT: add.s32 %r31, %r30, %r16;
|
||||
; DISABLED-NEXT: st.u32 [%rd2], %r31;
|
||||
; DISABLED-NEXT: st.b32 [%rd2], %r31;
|
||||
; DISABLED-NEXT: ret;
|
||||
%val0 = load i8, ptr %ptr1, align 16
|
||||
%ptr1.1 = getelementptr inbounds i8, ptr %ptr1, i64 1
|
||||
@@ -327,9 +327,9 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
|
||||
; ENABLED-NEXT: .reg .b64 %rd<3>;
|
||||
; ENABLED-EMPTY:
|
||||
; ENABLED-NEXT: // %bb.0:
|
||||
; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
|
||||
; ENABLED-NEXT: ld.param.b64 %rd1, [combine_v16i8_unaligned_param_0];
|
||||
; ENABLED-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
|
||||
; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
|
||||
; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_unaligned_param_1];
|
||||
; ENABLED-NEXT: ld.v2.b32 {%r3, %r4}, [%rd1+8];
|
||||
; ENABLED-NEXT: bfe.u32 %r5, %r1, 0, 8;
|
||||
; ENABLED-NEXT: bfe.u32 %r6, %r1, 8, 8;
|
||||
@@ -362,7 +362,7 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
|
||||
; ENABLED-NEXT: add.s32 %r33, %r32, %r18;
|
||||
; ENABLED-NEXT: add.s32 %r34, %r33, %r19;
|
||||
; ENABLED-NEXT: add.s32 %r35, %r34, %r20;
|
||||
; ENABLED-NEXT: st.u32 [%rd2], %r35;
|
||||
; ENABLED-NEXT: st.b32 [%rd2], %r35;
|
||||
; ENABLED-NEXT: ret;
|
||||
;
|
||||
; DISABLED-LABEL: combine_v16i8_unaligned(
|
||||
@@ -371,24 +371,24 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
|
||||
; DISABLED-NEXT: .reg .b64 %rd<3>;
|
||||
; DISABLED-EMPTY:
|
||||
; DISABLED-NEXT: // %bb.0:
|
||||
; DISABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
|
||||
; DISABLED-NEXT: ld.u8 %r1, [%rd1];
|
||||
; DISABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
|
||||
; DISABLED-NEXT: ld.u8 %r2, [%rd1+1];
|
||||
; DISABLED-NEXT: ld.u8 %r3, [%rd1+2];
|
||||
; DISABLED-NEXT: ld.u8 %r4, [%rd1+3];
|
||||
; DISABLED-NEXT: ld.u8 %r5, [%rd1+4];
|
||||
; DISABLED-NEXT: ld.u8 %r6, [%rd1+5];
|
||||
; DISABLED-NEXT: ld.u8 %r7, [%rd1+6];
|
||||
; DISABLED-NEXT: ld.u8 %r8, [%rd1+7];
|
||||
; DISABLED-NEXT: ld.u8 %r9, [%rd1+8];
|
||||
; DISABLED-NEXT: ld.u8 %r10, [%rd1+9];
|
||||
; DISABLED-NEXT: ld.u8 %r11, [%rd1+10];
|
||||
; DISABLED-NEXT: ld.u8 %r12, [%rd1+11];
|
||||
; DISABLED-NEXT: ld.u8 %r13, [%rd1+12];
|
||||
; DISABLED-NEXT: ld.u8 %r14, [%rd1+13];
|
||||
; DISABLED-NEXT: ld.u8 %r15, [%rd1+14];
|
||||
; DISABLED-NEXT: ld.u8 %r16, [%rd1+15];
|
||||
; DISABLED-NEXT: ld.param.b64 %rd1, [combine_v16i8_unaligned_param_0];
|
||||
; DISABLED-NEXT: ld.b8 %r1, [%rd1];
|
||||
; DISABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_unaligned_param_1];
|
||||
; DISABLED-NEXT: ld.b8 %r2, [%rd1+1];
|
||||
; DISABLED-NEXT: ld.b8 %r3, [%rd1+2];
|
||||
; DISABLED-NEXT: ld.b8 %r4, [%rd1+3];
|
||||
; DISABLED-NEXT: ld.b8 %r5, [%rd1+4];
|
||||
; DISABLED-NEXT: ld.b8 %r6, [%rd1+5];
|
||||
; DISABLED-NEXT: ld.b8 %r7, [%rd1+6];
|
||||
; DISABLED-NEXT: ld.b8 %r8, [%rd1+7];
|
||||
; DISABLED-NEXT: ld.b8 %r9, [%rd1+8];
|
||||
; DISABLED-NEXT: ld.b8 %r10, [%rd1+9];
|
||||
; DISABLED-NEXT: ld.b8 %r11, [%rd1+10];
|
||||
; DISABLED-NEXT: ld.b8 %r12, [%rd1+11];
|
||||
; DISABLED-NEXT: ld.b8 %r13, [%rd1+12];
|
||||
; DISABLED-NEXT: ld.b8 %r14, [%rd1+13];
|
||||
; DISABLED-NEXT: ld.b8 %r15, [%rd1+14];
|
||||
; DISABLED-NEXT: ld.b8 %r16, [%rd1+15];
|
||||
; DISABLED-NEXT: add.s32 %r17, %r1, %r2;
|
||||
; DISABLED-NEXT: add.s32 %r18, %r17, %r3;
|
||||
; DISABLED-NEXT: add.s32 %r19, %r18, %r4;
|
||||
@@ -404,7 +404,7 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
|
||||
; DISABLED-NEXT: add.s32 %r29, %r28, %r14;
|
||||
; DISABLED-NEXT: add.s32 %r30, %r29, %r15;
|
||||
; DISABLED-NEXT: add.s32 %r31, %r30, %r16;
|
||||
; DISABLED-NEXT: st.u32 [%rd2], %r31;
|
||||
; DISABLED-NEXT: st.b32 [%rd2], %r31;
|
||||
; DISABLED-NEXT: ret;
|
||||
%val0 = load i8, ptr %ptr1, align 8
|
||||
%ptr1.1 = getelementptr inbounds i8, ptr %ptr1, i64 1
|
||||
@@ -481,13 +481,13 @@ define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
|
||||
; ENABLED-NEXT: .reg .b64 %rd<3>;
|
||||
; ENABLED-EMPTY:
|
||||
; ENABLED-NEXT: // %bb.0:
|
||||
; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v8i16_param_0];
|
||||
; ENABLED-NEXT: ld.param.b64 %rd1, [combine_v8i16_param_0];
|
||||
; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; ENABLED-NEXT: mov.b32 {%rs1, %rs2}, %r4;
|
||||
; ENABLED-NEXT: mov.b32 {%rs3, %rs4}, %r3;
|
||||
; ENABLED-NEXT: mov.b32 {%rs5, %rs6}, %r2;
|
||||
; ENABLED-NEXT: mov.b32 {%rs7, %rs8}, %r1;
|
||||
; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v8i16_param_1];
|
||||
; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v8i16_param_1];
|
||||
; ENABLED-NEXT: cvt.u32.u16 %r5, %rs7;
|
||||
; ENABLED-NEXT: cvt.u32.u16 %r6, %rs8;
|
||||
; ENABLED-NEXT: cvt.u32.u16 %r7, %rs5;
|
||||
@@ -503,7 +503,7 @@ define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
|
||||
; ENABLED-NEXT: add.s32 %r17, %r16, %r10;
|
||||
; ENABLED-NEXT: add.s32 %r18, %r17, %r11;
|
||||
; ENABLED-NEXT: add.s32 %r19, %r18, %r12;
|
||||
; ENABLED-NEXT: st.u32 [%rd2], %r19;
|
||||
; ENABLED-NEXT: st.b32 [%rd2], %r19;
|
||||
; ENABLED-NEXT: ret;
|
||||
;
|
||||
; DISABLED-LABEL: combine_v8i16(
|
||||
@@ -512,16 +512,16 @@ define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
|
||||
; DISABLED-NEXT: .reg .b64 %rd<3>;
|
||||
; DISABLED-EMPTY:
|
||||
; DISABLED-NEXT: // %bb.0:
|
||||
; DISABLED-NEXT: ld.param.u64 %rd1, [combine_v8i16_param_0];
|
||||
; DISABLED-NEXT: ld.u16 %r1, [%rd1];
|
||||
; DISABLED-NEXT: ld.param.u64 %rd2, [combine_v8i16_param_1];
|
||||
; DISABLED-NEXT: ld.u16 %r2, [%rd1+2];
|
||||
; DISABLED-NEXT: ld.u16 %r3, [%rd1+4];
|
||||
; DISABLED-NEXT: ld.u16 %r4, [%rd1+6];
|
||||
; DISABLED-NEXT: ld.u16 %r5, [%rd1+8];
|
||||
; DISABLED-NEXT: ld.u16 %r6, [%rd1+10];
|
||||
; DISABLED-NEXT: ld.u16 %r7, [%rd1+12];
|
||||
; DISABLED-NEXT: ld.u16 %r8, [%rd1+14];
|
||||
; DISABLED-NEXT: ld.param.b64 %rd1, [combine_v8i16_param_0];
|
||||
; DISABLED-NEXT: ld.b16 %r1, [%rd1];
|
||||
; DISABLED-NEXT: ld.param.b64 %rd2, [combine_v8i16_param_1];
|
||||
; DISABLED-NEXT: ld.b16 %r2, [%rd1+2];
|
||||
; DISABLED-NEXT: ld.b16 %r3, [%rd1+4];
|
||||
; DISABLED-NEXT: ld.b16 %r4, [%rd1+6];
|
||||
; DISABLED-NEXT: ld.b16 %r5, [%rd1+8];
|
||||
; DISABLED-NEXT: ld.b16 %r6, [%rd1+10];
|
||||
; DISABLED-NEXT: ld.b16 %r7, [%rd1+12];
|
||||
; DISABLED-NEXT: ld.b16 %r8, [%rd1+14];
|
||||
; DISABLED-NEXT: add.s32 %r9, %r1, %r2;
|
||||
; DISABLED-NEXT: add.s32 %r10, %r9, %r3;
|
||||
; DISABLED-NEXT: add.s32 %r11, %r10, %r4;
|
||||
@@ -529,7 +529,7 @@ define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
|
||||
; DISABLED-NEXT: add.s32 %r13, %r12, %r6;
|
||||
; DISABLED-NEXT: add.s32 %r14, %r13, %r7;
|
||||
; DISABLED-NEXT: add.s32 %r15, %r14, %r8;
|
||||
; DISABLED-NEXT: st.u32 [%rd2], %r15;
|
||||
; DISABLED-NEXT: st.b32 [%rd2], %r15;
|
||||
; DISABLED-NEXT: ret;
|
||||
%val0 = load i16, ptr %ptr1, align 16
|
||||
%ptr1.1 = getelementptr inbounds i16, ptr %ptr1, i64 1
|
||||
@@ -572,13 +572,13 @@ define void @combine_v4i32(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
|
||||
; ENABLED-NEXT: .reg .b64 %rd<3>;
|
||||
; ENABLED-EMPTY:
|
||||
; ENABLED-NEXT: // %bb.0:
|
||||
; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v4i32_param_0];
|
||||
; ENABLED-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v4i32_param_1];
|
||||
; ENABLED-NEXT: ld.param.b64 %rd1, [combine_v4i32_param_0];
|
||||
; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v4i32_param_1];
|
||||
; ENABLED-NEXT: add.s32 %r5, %r1, %r2;
|
||||
; ENABLED-NEXT: add.s32 %r6, %r5, %r3;
|
||||
; ENABLED-NEXT: add.s32 %r7, %r6, %r4;
|
||||
; ENABLED-NEXT: st.u32 [%rd2], %r7;
|
||||
; ENABLED-NEXT: st.b32 [%rd2], %r7;
|
||||
; ENABLED-NEXT: ret;
|
||||
;
|
||||
; DISABLED-LABEL: combine_v4i32(
|
||||
@@ -587,16 +587,16 @@ define void @combine_v4i32(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
|
||||
; DISABLED-NEXT: .reg .b64 %rd<3>;
|
||||
; DISABLED-EMPTY:
|
||||
; DISABLED-NEXT: // %bb.0:
|
||||
; DISABLED-NEXT: ld.param.u64 %rd1, [combine_v4i32_param_0];
|
||||
; DISABLED-NEXT: ld.u32 %r1, [%rd1];
|
||||
; DISABLED-NEXT: ld.param.u64 %rd2, [combine_v4i32_param_1];
|
||||
; DISABLED-NEXT: ld.u32 %r2, [%rd1+4];
|
||||
; DISABLED-NEXT: ld.u32 %r3, [%rd1+8];
|
||||
; DISABLED-NEXT: ld.u32 %r4, [%rd1+12];
|
||||
; DISABLED-NEXT: ld.param.b64 %rd1, [combine_v4i32_param_0];
|
||||
; DISABLED-NEXT: ld.b32 %r1, [%rd1];
|
||||
; DISABLED-NEXT: ld.param.b64 %rd2, [combine_v4i32_param_1];
|
||||
; DISABLED-NEXT: ld.b32 %r2, [%rd1+4];
|
||||
; DISABLED-NEXT: ld.b32 %r3, [%rd1+8];
|
||||
; DISABLED-NEXT: ld.b32 %r4, [%rd1+12];
|
||||
; DISABLED-NEXT: add.s32 %r5, %r1, %r2;
|
||||
; DISABLED-NEXT: add.s32 %r6, %r5, %r3;
|
||||
; DISABLED-NEXT: add.s32 %r7, %r6, %r4;
|
||||
; DISABLED-NEXT: st.u32 [%rd2], %r7;
|
||||
; DISABLED-NEXT: st.b32 [%rd2], %r7;
|
||||
; DISABLED-NEXT: ret;
|
||||
%val0 = load i32, ptr %ptr1, align 16
|
||||
%ptr1.1 = getelementptr inbounds i32, ptr %ptr1, i64 1
|
||||
|
||||
@@ -10,7 +10,7 @@ declare void @foo()
|
||||
; the call may modify memory.
|
||||
define i32 @f(i32 %x, ptr %ptr, i1 %cond) {
|
||||
Start:
|
||||
; CHECK: ld.u32
|
||||
; CHECK: ld.b32
|
||||
%ptr_val = load i32, ptr %ptr
|
||||
; CHECK: call.uni
|
||||
call void @foo()
|
||||
|
||||
@@ -10,7 +10,7 @@ declare void @llvm.nvvm.barrier0()
|
||||
; syncthreads is modeled as maystore.
|
||||
define i32 @f(i32 %x, ptr %ptr, i1 %cond) {
|
||||
Start:
|
||||
; CHECK: ld.u32
|
||||
; CHECK: ld.b32
|
||||
%ptr_val = load i32, ptr %ptr
|
||||
; CHECK: bar.sync
|
||||
call void @llvm.nvvm.barrier0()
|
||||
|
||||
@@ -23,10 +23,10 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
|
||||
; load cast
|
||||
%1 = load float, ptr addrspacecast (ptr addrspace(3) @scalar to ptr), align 4
|
||||
call void @use(float %1)
|
||||
; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
|
||||
; PTX: ld.shared.b32 %f{{[0-9]+}}, [scalar];
|
||||
; store cast
|
||||
store float %v, ptr addrspacecast (ptr addrspace(3) @scalar to ptr), align 4
|
||||
; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
|
||||
; PTX: st.shared.b32 [scalar], %f{{[0-9]+}};
|
||||
; use syncthreads to disable optimizations across components
|
||||
call void @llvm.nvvm.barrier0()
|
||||
; PTX: bar.sync 0;
|
||||
@@ -35,20 +35,20 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
|
||||
%2 = addrspacecast ptr addrspace(3) @scalar to ptr
|
||||
%3 = load float, ptr %2, align 4
|
||||
call void @use(float %3)
|
||||
; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar];
|
||||
; PTX: ld.shared.b32 %f{{[0-9]+}}, [scalar];
|
||||
; cast; store
|
||||
store float %v, ptr %2, align 4
|
||||
; PTX: st.shared.f32 [scalar], %f{{[0-9]+}};
|
||||
; PTX: st.shared.b32 [scalar], %f{{[0-9]+}};
|
||||
call void @llvm.nvvm.barrier0()
|
||||
; PTX: bar.sync 0;
|
||||
|
||||
; load gep cast
|
||||
%4 = load float, ptr getelementptr inbounds ([10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i32 0, i32 5), align 4
|
||||
call void @use(float %4)
|
||||
; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
|
||||
; PTX: ld.shared.b32 %f{{[0-9]+}}, [array+20];
|
||||
; store gep cast
|
||||
store float %v, ptr getelementptr inbounds ([10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i32 0, i32 5), align 4
|
||||
; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
|
||||
; PTX: st.shared.b32 [array+20], %f{{[0-9]+}};
|
||||
call void @llvm.nvvm.barrier0()
|
||||
; PTX: bar.sync 0;
|
||||
|
||||
@@ -56,10 +56,10 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
|
||||
%5 = getelementptr inbounds [10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i32 0, i32 5
|
||||
%6 = load float, ptr %5, align 4
|
||||
call void @use(float %6)
|
||||
; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20];
|
||||
; PTX: ld.shared.b32 %f{{[0-9]+}}, [array+20];
|
||||
; gep cast; store
|
||||
store float %v, ptr %5, align 4
|
||||
; PTX: st.shared.f32 [array+20], %f{{[0-9]+}};
|
||||
; PTX: st.shared.b32 [array+20], %f{{[0-9]+}};
|
||||
call void @llvm.nvvm.barrier0()
|
||||
; PTX: bar.sync 0;
|
||||
|
||||
@@ -68,10 +68,10 @@ define void @ld_st_shared_f32(i32 %i, float %v) {
|
||||
%8 = getelementptr inbounds [10 x float], ptr %7, i32 0, i32 %i
|
||||
%9 = load float, ptr %8, align 4
|
||||
call void @use(float %9)
|
||||
; PTX: ld.shared.f32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}];
|
||||
; PTX: ld.shared.b32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}];
|
||||
; cast; gep; store
|
||||
store float %v, ptr %8, align 4
|
||||
; PTX: st.shared.f32 [%{{(r|rl|rd)[0-9]+}}], %f{{[0-9]+}};
|
||||
; PTX: st.shared.b32 [%{{(r|rl|rd)[0-9]+}}], %f{{[0-9]+}};
|
||||
call void @llvm.nvvm.barrier0()
|
||||
; PTX: bar.sync 0;
|
||||
|
||||
@@ -84,7 +84,7 @@ define i32 @ld_int_from_float() {
|
||||
; IR-LABEL: @ld_int_from_float
|
||||
; IR: load i32, ptr addrspace(3) @scalar
|
||||
; PTX-LABEL: ld_int_from_float(
|
||||
; PTX: ld.shared.u{{(32|64)}}
|
||||
; PTX: ld.shared.b{{(32|64)}}
|
||||
%1 = load i32, ptr addrspacecast(ptr addrspace(3) @scalar to ptr), align 4
|
||||
ret i32 %1
|
||||
}
|
||||
@@ -108,7 +108,7 @@ define void @nested_const_expr() {
|
||||
; store 1 to bitcast(gep(addrspacecast(array), 0, 1))
|
||||
store i32 1, ptr getelementptr ([10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i64 0, i64 1), align 4
|
||||
; PTX: mov.b32 %r1, 1;
|
||||
; PTX-NEXT: st.shared.u32 [array+4], %r1;
|
||||
; PTX-NEXT: st.shared.b32 [array+4], %r1;
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@@ -10,8 +10,8 @@ define i32 @test_addr_mode_i64(ptr %x) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i64_param_0];
|
||||
; CHECK-NEXT: ld.u32 %r1, [%rd1+-4];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_addr_mode_i64_param_0];
|
||||
; CHECK-NEXT: ld.b32 %r1, [%rd1+-4];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%addr = getelementptr i32, ptr %x, i64 -1
|
||||
@@ -26,8 +26,8 @@ define i32 @test_addr_mode_i32(ptr %x) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i32_param_0];
|
||||
; CHECK-NEXT: ld.u32 %r1, [%rd1+-4];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_addr_mode_i32_param_0];
|
||||
; CHECK-NEXT: ld.b32 %r1, [%rd1+-4];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%addr = getelementptr i32, ptr %x, i32 -1
|
||||
@@ -42,8 +42,8 @@ define i32 @test_addr_mode_i16(ptr %x) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i16_param_0];
|
||||
; CHECK-NEXT: ld.u32 %r1, [%rd1+-4];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_addr_mode_i16_param_0];
|
||||
; CHECK-NEXT: ld.b32 %r1, [%rd1+-4];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%addr = getelementptr i32, ptr %x, i16 -1
|
||||
@@ -58,8 +58,8 @@ define i32 @test_addr_mode_i8(ptr %x) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i8_param_0];
|
||||
; CHECK-NEXT: ld.u32 %r1, [%rd1+-4];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_addr_mode_i8_param_0];
|
||||
; CHECK-NEXT: ld.b32 %r1, [%rd1+-4];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%addr = getelementptr i32, ptr %x, i8 -1
|
||||
@@ -74,9 +74,9 @@ define i32 @test_addr_mode_i64_large(ptr %x) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i64_large_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_addr_mode_i64_large_param_0];
|
||||
; CHECK-NEXT: add.s64 %rd2, %rd1, 17179869172;
|
||||
; CHECK-NEXT: ld.u32 %r1, [%rd2];
|
||||
; CHECK-NEXT: ld.b32 %r1, [%rd2];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%addr = getelementptr i32, ptr %x, i64 4294967293
|
||||
|
||||
@@ -10,7 +10,7 @@ define ptr @test1(ptr %p) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test1_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test1_param_0];
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd1;
|
||||
; CHECK-NEXT: ret;
|
||||
%a = addrspacecast ptr %p to ptr addrspace(5)
|
||||
@@ -24,7 +24,7 @@ define ptr addrspace(1) @test2(ptr addrspace(5) %p) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test2_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test2_param_0];
|
||||
; CHECK-NEXT: cvta.local.u64 %rd2, %rd1;
|
||||
; CHECK-NEXT: cvta.to.global.u64 %rd3, %rd2;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
|
||||
@@ -13,9 +13,9 @@ define i32 @conv_shared_cluster_to_generic(ptr addrspace(7) %ptr) {
|
||||
; NOPTRCONV-NEXT: .reg .b64 %rd<3>;
|
||||
; NOPTRCONV-EMPTY:
|
||||
; NOPTRCONV-NEXT: // %bb.0:
|
||||
; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_shared_cluster_to_generic_param_0];
|
||||
; NOPTRCONV-NEXT: ld.param.b64 %rd1, [conv_shared_cluster_to_generic_param_0];
|
||||
; NOPTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1;
|
||||
; NOPTRCONV-NEXT: ld.u32 %r1, [%rd2];
|
||||
; NOPTRCONV-NEXT: ld.b32 %r1, [%rd2];
|
||||
; NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; NOPTRCONV-NEXT: ret;
|
||||
;
|
||||
@@ -25,10 +25,10 @@ define i32 @conv_shared_cluster_to_generic(ptr addrspace(7) %ptr) {
|
||||
; PTRCONV-NEXT: .reg .b64 %rd<3>;
|
||||
; PTRCONV-EMPTY:
|
||||
; PTRCONV-NEXT: // %bb.0:
|
||||
; PTRCONV-NEXT: ld.param.u32 %r1, [conv_shared_cluster_to_generic_param_0];
|
||||
; PTRCONV-NEXT: ld.param.b32 %r1, [conv_shared_cluster_to_generic_param_0];
|
||||
; PTRCONV-NEXT: cvt.u64.u32 %rd1, %r1;
|
||||
; PTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1;
|
||||
; PTRCONV-NEXT: ld.u32 %r2, [%rd2];
|
||||
; PTRCONV-NEXT: ld.b32 %r2, [%rd2];
|
||||
; PTRCONV-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; PTRCONV-NEXT: ret;
|
||||
%genptr = addrspacecast ptr addrspace(7) %ptr to ptr
|
||||
@@ -45,9 +45,9 @@ define i32 @conv_generic_to_shared_cluster(ptr %ptr) {
|
||||
; NOPTRCONV-NEXT: .reg .b64 %rd<3>;
|
||||
; NOPTRCONV-EMPTY:
|
||||
; NOPTRCONV-NEXT: // %bb.0:
|
||||
; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_generic_to_shared_cluster_param_0];
|
||||
; NOPTRCONV-NEXT: ld.param.b64 %rd1, [conv_generic_to_shared_cluster_param_0];
|
||||
; NOPTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd2, %rd1;
|
||||
; NOPTRCONV-NEXT: ld.shared::cluster.u32 %r1, [%rd2];
|
||||
; NOPTRCONV-NEXT: ld.shared::cluster.b32 %r1, [%rd2];
|
||||
; NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; NOPTRCONV-NEXT: ret;
|
||||
;
|
||||
@@ -57,10 +57,10 @@ define i32 @conv_generic_to_shared_cluster(ptr %ptr) {
|
||||
; PTRCONV-NEXT: .reg .b64 %rd<3>;
|
||||
; PTRCONV-EMPTY:
|
||||
; PTRCONV-NEXT: // %bb.0:
|
||||
; PTRCONV-NEXT: ld.param.u64 %rd1, [conv_generic_to_shared_cluster_param_0];
|
||||
; PTRCONV-NEXT: ld.param.b64 %rd1, [conv_generic_to_shared_cluster_param_0];
|
||||
; PTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd2, %rd1;
|
||||
; PTRCONV-NEXT: cvt.u32.u64 %r1, %rd2;
|
||||
; PTRCONV-NEXT: ld.shared::cluster.u32 %r2, [%r1];
|
||||
; PTRCONV-NEXT: ld.shared::cluster.b32 %r2, [%r1];
|
||||
; PTRCONV-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; PTRCONV-NEXT: ret;
|
||||
%specptr = addrspacecast ptr %ptr to ptr addrspace(7)
|
||||
@@ -76,10 +76,10 @@ define i32 @conv_shared_to_shared_cluster(ptr addrspace(3) %ptr) {
|
||||
; NOPTRCONV-NEXT: .reg .b64 %rd<4>;
|
||||
; NOPTRCONV-EMPTY:
|
||||
; NOPTRCONV-NEXT: // %bb.0:
|
||||
; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_shared_to_shared_cluster_param_0];
|
||||
; NOPTRCONV-NEXT: ld.param.b64 %rd1, [conv_shared_to_shared_cluster_param_0];
|
||||
; NOPTRCONV-NEXT: cvta.shared.u64 %rd2, %rd1;
|
||||
; NOPTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd3, %rd2;
|
||||
; NOPTRCONV-NEXT: ld.shared::cluster.u32 %r1, [%rd3];
|
||||
; NOPTRCONV-NEXT: ld.shared::cluster.b32 %r1, [%rd3];
|
||||
; NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; NOPTRCONV-NEXT: ret;
|
||||
;
|
||||
@@ -89,12 +89,12 @@ define i32 @conv_shared_to_shared_cluster(ptr addrspace(3) %ptr) {
|
||||
; PTRCONV-NEXT: .reg .b64 %rd<4>;
|
||||
; PTRCONV-EMPTY:
|
||||
; PTRCONV-NEXT: // %bb.0:
|
||||
; PTRCONV-NEXT: ld.param.u32 %r1, [conv_shared_to_shared_cluster_param_0];
|
||||
; PTRCONV-NEXT: ld.param.b32 %r1, [conv_shared_to_shared_cluster_param_0];
|
||||
; PTRCONV-NEXT: cvt.u64.u32 %rd1, %r1;
|
||||
; PTRCONV-NEXT: cvta.shared.u64 %rd2, %rd1;
|
||||
; PTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd3, %rd2;
|
||||
; PTRCONV-NEXT: cvt.u32.u64 %r2, %rd3;
|
||||
; PTRCONV-NEXT: ld.shared::cluster.u32 %r3, [%r2];
|
||||
; PTRCONV-NEXT: ld.shared::cluster.b32 %r3, [%r2];
|
||||
; PTRCONV-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; PTRCONV-NEXT: ret;
|
||||
%specptr = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(7)
|
||||
@@ -110,10 +110,10 @@ define i32 @conv_shared_cluster_to_shared(ptr addrspace(7) %ptr) {
|
||||
; NOPTRCONV-NEXT: .reg .b64 %rd<4>;
|
||||
; NOPTRCONV-EMPTY:
|
||||
; NOPTRCONV-NEXT: // %bb.0:
|
||||
; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_shared_cluster_to_shared_param_0];
|
||||
; NOPTRCONV-NEXT: ld.param.b64 %rd1, [conv_shared_cluster_to_shared_param_0];
|
||||
; NOPTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1;
|
||||
; NOPTRCONV-NEXT: cvta.to.shared.u64 %rd3, %rd2;
|
||||
; NOPTRCONV-NEXT: ld.shared.u32 %r1, [%rd3];
|
||||
; NOPTRCONV-NEXT: ld.shared.b32 %r1, [%rd3];
|
||||
; NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; NOPTRCONV-NEXT: ret;
|
||||
;
|
||||
@@ -123,12 +123,12 @@ define i32 @conv_shared_cluster_to_shared(ptr addrspace(7) %ptr) {
|
||||
; PTRCONV-NEXT: .reg .b64 %rd<4>;
|
||||
; PTRCONV-EMPTY:
|
||||
; PTRCONV-NEXT: // %bb.0:
|
||||
; PTRCONV-NEXT: ld.param.u32 %r1, [conv_shared_cluster_to_shared_param_0];
|
||||
; PTRCONV-NEXT: ld.param.b32 %r1, [conv_shared_cluster_to_shared_param_0];
|
||||
; PTRCONV-NEXT: cvt.u64.u32 %rd1, %r1;
|
||||
; PTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1;
|
||||
; PTRCONV-NEXT: cvta.to.shared.u64 %rd3, %rd2;
|
||||
; PTRCONV-NEXT: cvt.u32.u64 %r2, %rd3;
|
||||
; PTRCONV-NEXT: ld.shared.u32 %r3, [%r2];
|
||||
; PTRCONV-NEXT: ld.shared.b32 %r3, [%r2];
|
||||
; PTRCONV-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; PTRCONV-NEXT: ret;
|
||||
%specptr = addrspacecast ptr addrspace(7) %ptr to ptr addrspace(3)
|
||||
|
||||
@@ -10,7 +10,7 @@ define i32 @conv1(ptr addrspace(1) %ptr) {
|
||||
; CLS32: cvta.global.u32
|
||||
; ALL-NOT: cvt.u64.u32
|
||||
; CLS64: cvta.global.u64
|
||||
; ALL: ld.u32
|
||||
; ALL: ld.b32
|
||||
%genptr = addrspacecast ptr addrspace(1) %ptr to ptr
|
||||
%val = load i32, ptr %genptr
|
||||
ret i32 %val
|
||||
@@ -22,7 +22,7 @@ define i32 @conv2(ptr addrspace(3) %ptr) {
|
||||
; PTRCONV: cvt.u64.u32
|
||||
; NOPTRCONV-NOT: cvt.u64.u32
|
||||
; CLS64: cvta.shared.u64
|
||||
; ALL: ld.u32
|
||||
; ALL: ld.b32
|
||||
%genptr = addrspacecast ptr addrspace(3) %ptr to ptr
|
||||
%val = load i32, ptr %genptr
|
||||
ret i32 %val
|
||||
@@ -34,7 +34,7 @@ define i32 @conv3(ptr addrspace(4) %ptr) {
|
||||
; PTRCONV: cvt.u64.u32
|
||||
; NOPTRCONV-NOT: cvt.u64.u32
|
||||
; CLS64: cvta.const.u64
|
||||
; ALL: ld.u32
|
||||
; ALL: ld.b32
|
||||
%genptr = addrspacecast ptr addrspace(4) %ptr to ptr
|
||||
%val = load i32, ptr %genptr
|
||||
ret i32 %val
|
||||
@@ -46,7 +46,7 @@ define i32 @conv4(ptr addrspace(5) %ptr) {
|
||||
; PTRCONV: cvt.u64.u32
|
||||
; NOPTRCONV-NOT: cvt.u64.u32
|
||||
; CLS64: cvta.local.u64
|
||||
; ALL: ld.u32
|
||||
; ALL: ld.b32
|
||||
%genptr = addrspacecast ptr addrspace(5) %ptr to ptr
|
||||
%val = load i32, ptr %genptr
|
||||
ret i32 %val
|
||||
@@ -57,7 +57,7 @@ define i32 @conv5(ptr %ptr) {
|
||||
; CLS32: cvta.to.global.u32
|
||||
; ALL-NOT: cvt.u64.u32
|
||||
; CLS64: cvta.to.global.u64
|
||||
; ALL: ld.global.u32
|
||||
; ALL: ld.global.b32
|
||||
%specptr = addrspacecast ptr %ptr to ptr addrspace(1)
|
||||
%val = load i32, ptr addrspace(1) %specptr
|
||||
ret i32 %val
|
||||
@@ -69,7 +69,7 @@ define i32 @conv6(ptr %ptr) {
|
||||
; CLS64: cvta.to.shared.u64
|
||||
; PTRCONV: cvt.u32.u64
|
||||
; NOPTRCONV-NOT: cvt.u32.u64
|
||||
; ALL: ld.shared.u32
|
||||
; ALL: ld.shared.b32
|
||||
%specptr = addrspacecast ptr %ptr to ptr addrspace(3)
|
||||
%val = load i32, ptr addrspace(3) %specptr
|
||||
ret i32 %val
|
||||
@@ -81,7 +81,7 @@ define i32 @conv7(ptr %ptr) {
|
||||
; CLS64: cvta.to.const.u64
|
||||
; PTRCONV: cvt.u32.u64
|
||||
; NOPTRCONV-NOT: cvt.u32.u64
|
||||
; ALL: ld.const.u32
|
||||
; ALL: ld.const.b32
|
||||
%specptr = addrspacecast ptr %ptr to ptr addrspace(4)
|
||||
%val = load i32, ptr addrspace(4) %specptr
|
||||
ret i32 %val
|
||||
@@ -93,7 +93,7 @@ define i32 @conv8(ptr %ptr) {
|
||||
; CLS64: cvta.to.local.u64
|
||||
; PTRCONV: cvt.u32.u64
|
||||
; NOPTRCONV-NOT: cvt.u32.u64
|
||||
; ALL: ld.local.u32
|
||||
; ALL: ld.local.b32
|
||||
%specptr = addrspacecast ptr %ptr to ptr addrspace(5)
|
||||
%val = load i32, ptr addrspace(5) %specptr
|
||||
ret i32 %val
|
||||
@@ -104,7 +104,7 @@ define i32 @conv9(ptr addrspace(1) %ptr) {
|
||||
; CLS32: // implicit-def: %[[ADDR:r[0-9]+]]
|
||||
; PTRCONV: // implicit-def: %[[ADDR:r[0-9]+]]
|
||||
; NOPTRCONV: // implicit-def: %[[ADDR:rd[0-9]+]]
|
||||
; ALL: ld.shared.u32 %r{{[0-9]+}}, [%[[ADDR]]]
|
||||
; ALL: ld.shared.b32 %r{{[0-9]+}}, [%[[ADDR]]]
|
||||
%specptr = addrspacecast ptr addrspace(1) %ptr to ptr addrspace(3)
|
||||
%val = load i32, ptr addrspace(3) %specptr
|
||||
ret i32 %val
|
||||
@@ -120,8 +120,8 @@ define void @split1To0(ptr nocapture noundef readonly %xs) {
|
||||
; CLS32: cvta.global.u32
|
||||
; CLS64: cvta.global.u64
|
||||
; CLS64: cvta.global.u64
|
||||
; ALL: st.u32
|
||||
; ALL: st.u32
|
||||
; ALL: st.b32
|
||||
; ALL: st.b32
|
||||
%vec_addr = load <2 x ptr addrspace(1)>, ptr %xs, align 16
|
||||
%addrspacecast = addrspacecast <2 x ptr addrspace(1)> %vec_addr to <2 x ptr>
|
||||
%extractelement0 = extractelement <2 x ptr> %addrspacecast, i64 0
|
||||
@@ -139,8 +139,8 @@ define void @split0To1(ptr nocapture noundef readonly %xs) {
|
||||
; CLS32: cvta.to.global.u32
|
||||
; CLS64: cvta.to.global.u64
|
||||
; CLS64: cvta.to.global.u64
|
||||
; ALL: st.global.u32
|
||||
; ALL: st.global.u32
|
||||
; ALL: st.global.b32
|
||||
; ALL: st.global.b32
|
||||
%vec_addr = load <2 x ptr>, ptr %xs, align 16
|
||||
%addrspacecast = addrspacecast <2 x ptr> %vec_addr to <2 x ptr addrspace(1)>
|
||||
%extractelement0 = extractelement <2 x ptr addrspace(1)> %addrspacecast, i64 0
|
||||
@@ -162,9 +162,9 @@ define void @widen1To0(ptr nocapture noundef readonly %xs) {
|
||||
; CLS64: cvta.global.u64
|
||||
; CLS64: cvta.global.u64
|
||||
|
||||
; ALL: st.u32
|
||||
; ALL: st.u32
|
||||
; ALL: st.u32
|
||||
; ALL: st.b32
|
||||
; ALL: st.b32
|
||||
; ALL: st.b32
|
||||
%vec_addr = load <3 x ptr addrspace(1)>, ptr %xs, align 16
|
||||
%addrspacecast = addrspacecast <3 x ptr addrspace(1)> %vec_addr to <3 x ptr>
|
||||
%extractelement0 = extractelement <3 x ptr> %addrspacecast, i64 0
|
||||
@@ -188,9 +188,9 @@ define void @widen0To1(ptr nocapture noundef readonly %xs) {
|
||||
; CLS64: cvta.to.global.u64
|
||||
; CLS64: cvta.to.global.u64
|
||||
|
||||
; ALL: st.global.u32
|
||||
; ALL: st.global.u32
|
||||
; ALL: st.global.u32
|
||||
; ALL: st.global.b32
|
||||
; ALL: st.global.b32
|
||||
; ALL: st.global.b32
|
||||
%vec_addr = load <3 x ptr>, ptr %xs, align 16
|
||||
%addrspacecast = addrspacecast <3 x ptr> %vec_addr to <3 x ptr addrspace(1)>
|
||||
%extractelement0 = extractelement <3 x ptr addrspace(1)> %addrspacecast, i64 0
|
||||
|
||||
@@ -10,9 +10,9 @@ define void @test_v2f32(<2 x float> %input, ptr %output) {
|
||||
; CHECK-LABEL: @test_v2f32
|
||||
%call = tail call <2 x float> @barv(<2 x float> %input)
|
||||
; CHECK: .param .align 8 .b8 retval0[8];
|
||||
; CHECK: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0];
|
||||
; CHECK: ld.param.v2.b32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0];
|
||||
store <2 x float> %call, ptr %output, align 8
|
||||
; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]}
|
||||
; CHECK: st.v2.b32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]}
|
||||
ret void
|
||||
}
|
||||
|
||||
@@ -21,15 +21,15 @@ define void @test_v3f32(<3 x float> %input, ptr %output) {
|
||||
;
|
||||
%call = tail call <3 x float> @barv3(<3 x float> %input)
|
||||
; CHECK: .param .align 16 .b8 retval0[16];
|
||||
; CHECK-DAG: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0];
|
||||
; CHECK-DAG: ld.param.f32 [[E2:%f[0-9]+]], [retval0+8];
|
||||
; CHECK-DAG: ld.param.v2.b32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0];
|
||||
; CHECK-DAG: ld.param.b32 [[E2:%f[0-9]+]], [retval0+8];
|
||||
; Make sure we don't load more values than we need to.
|
||||
; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12];
|
||||
; CHECK-NOT: ld.param.b32 [[E3:%f[0-9]+]], [retval0+12];
|
||||
store <3 x float> %call, ptr %output, align 8
|
||||
; CHECK-DAG: st.f32 [{{%rd[0-9]}}+8],
|
||||
; CHECK-DAG: st.b32 [{{%rd[0-9]}}+8],
|
||||
; -- This is suboptimal. We should do st.v2.f32 instead
|
||||
; of combining 2xf32 into i64.
|
||||
; CHECK-DAG: st.u64 [{{%rd[0-9]}}],
|
||||
; CHECK-DAG: st.b64 [{{%rd[0-9]}}],
|
||||
; CHECK: ret;
|
||||
ret void
|
||||
}
|
||||
@@ -38,12 +38,12 @@ define void @test_a2f32([2 x float] %input, ptr %output) {
|
||||
; CHECK-LABEL: @test_a2f32
|
||||
%call = tail call [2 x float] @bara([2 x float] %input)
|
||||
; CHECK: .param .align 4 .b8 retval0[8];
|
||||
; CHECK-DAG: ld.param.f32 [[ELEMA1:%f[0-9]+]], [retval0];
|
||||
; CHECK-DAG: ld.param.f32 [[ELEMA2:%f[0-9]+]], [retval0+4];
|
||||
; CHECK-DAG: ld.param.b32 [[ELEMA1:%f[0-9]+]], [retval0];
|
||||
; CHECK-DAG: ld.param.b32 [[ELEMA2:%f[0-9]+]], [retval0+4];
|
||||
store [2 x float] %call, ptr %output, align 4
|
||||
; CHECK: }
|
||||
; CHECK-DAG: st.f32 [{{%rd[0-9]+}}], [[ELEMA1]]
|
||||
; CHECK-DAG: st.f32 [{{%rd[0-9]+}}+4], [[ELEMA2]]
|
||||
; CHECK-DAG: st.b32 [{{%rd[0-9]+}}], [[ELEMA1]]
|
||||
; CHECK-DAG: st.b32 [{{%rd[0-9]+}}+4], [[ELEMA2]]
|
||||
ret void
|
||||
; CHECK: ret
|
||||
}
|
||||
@@ -52,12 +52,12 @@ define void @test_s2f32({float, float} %input, ptr %output) {
|
||||
; CHECK-LABEL: @test_s2f32
|
||||
%call = tail call {float, float} @bars({float, float} %input)
|
||||
; CHECK: .param .align 4 .b8 retval0[8];
|
||||
; CHECK-DAG: ld.param.f32 [[ELEMS1:%f[0-9]+]], [retval0];
|
||||
; CHECK-DAG: ld.param.f32 [[ELEMS2:%f[0-9]+]], [retval0+4];
|
||||
; CHECK-DAG: ld.param.b32 [[ELEMS1:%f[0-9]+]], [retval0];
|
||||
; CHECK-DAG: ld.param.b32 [[ELEMS2:%f[0-9]+]], [retval0+4];
|
||||
store {float, float} %call, ptr %output, align 4
|
||||
; CHECK: }
|
||||
; CHECK-DAG: st.f32 [{{%rd[0-9]+}}], [[ELEMS1]]
|
||||
; CHECK-DAG: st.f32 [{{%rd[0-9]+}}+4], [[ELEMS2]]
|
||||
; CHECK-DAG: st.b32 [{{%rd[0-9]+}}], [[ELEMS1]]
|
||||
; CHECK-DAG: st.b32 [{{%rd[0-9]+}}+4], [[ELEMS2]]
|
||||
ret void
|
||||
; CHECK: ret
|
||||
}
|
||||
|
||||
@@ -12,8 +12,8 @@ define i1 @and_ord(float %a, float %b) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [and_ord_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [and_ord_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [and_ord_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [and_ord_param_1];
|
||||
; CHECK-NEXT: setp.num.f32 %p1, %f1, %f2;
|
||||
; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
@@ -32,8 +32,8 @@ define i1 @or_uno(float %a, float %b) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [or_uno_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [or_uno_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [or_uno_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [or_uno_param_1];
|
||||
; CHECK-NEXT: setp.nan.f32 %p1, %f1, %f2;
|
||||
; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
; CHECK: .func (.param .b32 func_retval0) __unnamed_1(
|
||||
; CHECK-NEXT: .param .b32 __unnamed_1_param_0
|
||||
; CHECK: ld.param.u32 {{%r[0-9]+}}, [__unnamed_1_param_0];
|
||||
; CHECK: ld.param.b32 {{%r[0-9]+}}, [__unnamed_1_param_0];
|
||||
|
||||
define internal i32 @0(i32 %a) {
|
||||
entry:
|
||||
@@ -16,7 +16,7 @@ entry:
|
||||
|
||||
; CHECK: .func (.param .b32 func_retval0) __unnamed_2(
|
||||
; CHECK-NEXT: .param .b32 __unnamed_2_param_0
|
||||
; CHECK: ld.param.u32 {{%r[0-9]+}}, [__unnamed_2_param_0];
|
||||
; CHECK: ld.param.b32 {{%r[0-9]+}}, [__unnamed_2_param_0];
|
||||
|
||||
define internal i32 @1(i32 %a) {
|
||||
entry:
|
||||
|
||||
@@ -13,7 +13,7 @@ define void @applypriority_global_L2(ptr addrspace(1) %global_ptr, i64 %size) {
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [applypriority_global_L2_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [applypriority_global_L2_param_0];
|
||||
; CHECK-PTX64-NEXT: applypriority.global.L2::evict_normal [%rd1], 128;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
tail call void @llvm.nvvm.applypriority.global.L2.evict.normal(ptr addrspace(1) %global_ptr, i64 128)
|
||||
@@ -26,7 +26,7 @@ define void @applypriority_L2(ptr %ptr, i64 %size) {
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [applypriority_L2_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [applypriority_L2_param_0];
|
||||
; CHECK-PTX64-NEXT: applypriority.L2::evict_normal [%rd1], 128;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
tail call void @llvm.nvvm.applypriority.L2.evict.normal(ptr %ptr, i64 128)
|
||||
|
||||
@@ -15,13 +15,13 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [test_param_3];
|
||||
; CHECK-NEXT: atom.add.noftz.f16 %rs2, [%r1], %rs1;
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_param_1];
|
||||
; CHECK-NEXT: mov.b16 %rs3, 0x3C00;
|
||||
; CHECK-NEXT: atom.add.noftz.f16 %rs4, [%r1], %rs3;
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_param_2];
|
||||
; CHECK-NEXT: atom.global.add.noftz.f16 %rs5, [%r2], %rs1;
|
||||
; CHECK-NEXT: atom.shared.add.noftz.f16 %rs6, [%r3], %rs1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -32,13 +32,13 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
|
||||
; CHECK64-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK64-EMPTY:
|
||||
; CHECK64-NEXT: // %bb.0:
|
||||
; CHECK64-NEXT: ld.param.u64 %rd1, [test_param_0];
|
||||
; CHECK64-NEXT: ld.param.b64 %rd1, [test_param_0];
|
||||
; CHECK64-NEXT: ld.param.b16 %rs1, [test_param_3];
|
||||
; CHECK64-NEXT: atom.add.noftz.f16 %rs2, [%rd1], %rs1;
|
||||
; CHECK64-NEXT: ld.param.u64 %rd2, [test_param_1];
|
||||
; CHECK64-NEXT: ld.param.b64 %rd2, [test_param_1];
|
||||
; CHECK64-NEXT: mov.b16 %rs3, 0x3C00;
|
||||
; CHECK64-NEXT: atom.add.noftz.f16 %rs4, [%rd1], %rs3;
|
||||
; CHECK64-NEXT: ld.param.u64 %rd3, [test_param_2];
|
||||
; CHECK64-NEXT: ld.param.b64 %rd3, [test_param_2];
|
||||
; CHECK64-NEXT: atom.global.add.noftz.f16 %rs5, [%rd2], %rs1;
|
||||
; CHECK64-NEXT: atom.shared.add.noftz.f16 %rs6, [%rd3], %rs1;
|
||||
; CHECK64-NEXT: ret;
|
||||
@@ -51,16 +51,16 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
|
||||
; CHECKPTX62-EMPTY:
|
||||
; CHECKPTX62-NEXT: // %bb.0:
|
||||
; CHECKPTX62-NEXT: ld.param.b16 %rs1, [test_param_3];
|
||||
; CHECKPTX62-NEXT: ld.param.u32 %r23, [test_param_2];
|
||||
; CHECKPTX62-NEXT: ld.param.u32 %r22, [test_param_1];
|
||||
; CHECKPTX62-NEXT: ld.param.u32 %r24, [test_param_0];
|
||||
; CHECKPTX62-NEXT: ld.param.b32 %r23, [test_param_2];
|
||||
; CHECKPTX62-NEXT: ld.param.b32 %r22, [test_param_1];
|
||||
; CHECKPTX62-NEXT: ld.param.b32 %r24, [test_param_0];
|
||||
; CHECKPTX62-NEXT: and.b32 %r1, %r24, -4;
|
||||
; CHECKPTX62-NEXT: and.b32 %r25, %r24, 3;
|
||||
; CHECKPTX62-NEXT: shl.b32 %r2, %r25, 3;
|
||||
; CHECKPTX62-NEXT: mov.b32 %r26, 65535;
|
||||
; CHECKPTX62-NEXT: shl.b32 %r27, %r26, %r2;
|
||||
; CHECKPTX62-NEXT: not.b32 %r3, %r27;
|
||||
; CHECKPTX62-NEXT: ld.u32 %r54, [%r1];
|
||||
; CHECKPTX62-NEXT: ld.b32 %r54, [%r1];
|
||||
; CHECKPTX62-NEXT: $L__BB0_1: // %atomicrmw.start45
|
||||
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
; CHECKPTX62-NEXT: shr.u32 %r28, %r54, %r2;
|
||||
@@ -75,7 +75,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
|
||||
; CHECKPTX62-NEXT: mov.b32 %r54, %r6;
|
||||
; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1;
|
||||
; CHECKPTX62-NEXT: // %bb.2: // %atomicrmw.end44
|
||||
; CHECKPTX62-NEXT: ld.u32 %r55, [%r1];
|
||||
; CHECKPTX62-NEXT: ld.b32 %r55, [%r1];
|
||||
; CHECKPTX62-NEXT: $L__BB0_3: // %atomicrmw.start27
|
||||
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
; CHECKPTX62-NEXT: shr.u32 %r33, %r55, %r2;
|
||||
@@ -97,7 +97,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
|
||||
; CHECKPTX62-NEXT: mov.b32 %r39, 65535;
|
||||
; CHECKPTX62-NEXT: shl.b32 %r40, %r39, %r11;
|
||||
; CHECKPTX62-NEXT: not.b32 %r12, %r40;
|
||||
; CHECKPTX62-NEXT: ld.global.u32 %r56, [%r10];
|
||||
; CHECKPTX62-NEXT: ld.global.b32 %r56, [%r10];
|
||||
; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9
|
||||
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
; CHECKPTX62-NEXT: shr.u32 %r41, %r56, %r11;
|
||||
@@ -118,7 +118,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
|
||||
; CHECKPTX62-NEXT: mov.b32 %r47, 65535;
|
||||
; CHECKPTX62-NEXT: shl.b32 %r48, %r47, %r17;
|
||||
; CHECKPTX62-NEXT: not.b32 %r18, %r48;
|
||||
; CHECKPTX62-NEXT: ld.shared.u32 %r57, [%r16];
|
||||
; CHECKPTX62-NEXT: ld.shared.b32 %r57, [%r16];
|
||||
; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start
|
||||
; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
; CHECKPTX62-NEXT: shr.u32 %r49, %r57, %r17;
|
||||
|
||||
@@ -15,13 +15,13 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [test_param_3];
|
||||
; CHECK-NEXT: atom.add.noftz.bf16 %rs2, [%r1], %rs1;
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_param_1];
|
||||
; CHECK-NEXT: mov.b16 %rs3, 0x3F80;
|
||||
; CHECK-NEXT: atom.add.noftz.bf16 %rs4, [%r1], %rs3;
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_param_2];
|
||||
; CHECK-NEXT: atom.global.add.noftz.bf16 %rs5, [%r2], %rs1;
|
||||
; CHECK-NEXT: atom.shared.add.noftz.bf16 %rs6, [%r3], %rs1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -32,13 +32,13 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
|
||||
; CHECK64-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK64-EMPTY:
|
||||
; CHECK64-NEXT: // %bb.0:
|
||||
; CHECK64-NEXT: ld.param.u64 %rd1, [test_param_0];
|
||||
; CHECK64-NEXT: ld.param.b64 %rd1, [test_param_0];
|
||||
; CHECK64-NEXT: ld.param.b16 %rs1, [test_param_3];
|
||||
; CHECK64-NEXT: atom.add.noftz.bf16 %rs2, [%rd1], %rs1;
|
||||
; CHECK64-NEXT: ld.param.u64 %rd2, [test_param_1];
|
||||
; CHECK64-NEXT: ld.param.b64 %rd2, [test_param_1];
|
||||
; CHECK64-NEXT: mov.b16 %rs3, 0x3F80;
|
||||
; CHECK64-NEXT: atom.add.noftz.bf16 %rs4, [%rd1], %rs3;
|
||||
; CHECK64-NEXT: ld.param.u64 %rd3, [test_param_2];
|
||||
; CHECK64-NEXT: ld.param.b64 %rd3, [test_param_2];
|
||||
; CHECK64-NEXT: atom.global.add.noftz.bf16 %rs5, [%rd2], %rs1;
|
||||
; CHECK64-NEXT: atom.shared.add.noftz.bf16 %rs6, [%rd3], %rs1;
|
||||
; CHECK64-NEXT: ret;
|
||||
@@ -51,16 +51,16 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
|
||||
; CHECKPTX71-EMPTY:
|
||||
; CHECKPTX71-NEXT: // %bb.0:
|
||||
; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3];
|
||||
; CHECKPTX71-NEXT: ld.param.u32 %r23, [test_param_2];
|
||||
; CHECKPTX71-NEXT: ld.param.u32 %r22, [test_param_1];
|
||||
; CHECKPTX71-NEXT: ld.param.u32 %r24, [test_param_0];
|
||||
; CHECKPTX71-NEXT: ld.param.b32 %r23, [test_param_2];
|
||||
; CHECKPTX71-NEXT: ld.param.b32 %r22, [test_param_1];
|
||||
; CHECKPTX71-NEXT: ld.param.b32 %r24, [test_param_0];
|
||||
; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4;
|
||||
; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3;
|
||||
; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3;
|
||||
; CHECKPTX71-NEXT: mov.b32 %r26, 65535;
|
||||
; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2;
|
||||
; CHECKPTX71-NEXT: not.b32 %r3, %r27;
|
||||
; CHECKPTX71-NEXT: ld.u32 %r54, [%r1];
|
||||
; CHECKPTX71-NEXT: ld.b32 %r54, [%r1];
|
||||
; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45
|
||||
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2;
|
||||
@@ -76,7 +76,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
|
||||
; CHECKPTX71-NEXT: mov.b32 %r54, %r6;
|
||||
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
|
||||
; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44
|
||||
; CHECKPTX71-NEXT: ld.u32 %r55, [%r1];
|
||||
; CHECKPTX71-NEXT: ld.b32 %r55, [%r1];
|
||||
; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27
|
||||
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2;
|
||||
@@ -98,7 +98,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
|
||||
; CHECKPTX71-NEXT: mov.b32 %r39, 65535;
|
||||
; CHECKPTX71-NEXT: shl.b32 %r40, %r39, %r11;
|
||||
; CHECKPTX71-NEXT: not.b32 %r12, %r40;
|
||||
; CHECKPTX71-NEXT: ld.global.u32 %r56, [%r10];
|
||||
; CHECKPTX71-NEXT: ld.global.b32 %r56, [%r10];
|
||||
; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9
|
||||
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11;
|
||||
@@ -120,7 +120,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
|
||||
; CHECKPTX71-NEXT: mov.b32 %r47, 65535;
|
||||
; CHECKPTX71-NEXT: shl.b32 %r48, %r47, %r17;
|
||||
; CHECKPTX71-NEXT: not.b32 %r18, %r48;
|
||||
; CHECKPTX71-NEXT: ld.shared.u32 %r57, [%r16];
|
||||
; CHECKPTX71-NEXT: ld.shared.b32 %r57, [%r16];
|
||||
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start
|
||||
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17;
|
||||
|
||||
@@ -11,8 +11,8 @@ define i32 @atom0(ptr %addr, i32 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom0_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [atom0_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom0_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [atom0_param_1];
|
||||
; CHECK-NEXT: atom.add.u32 %r2, [%rd1], %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -27,8 +27,8 @@ define i64 @atom1(ptr %addr, i64 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom1_param_0];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [atom1_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom1_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [atom1_param_1];
|
||||
; CHECK-NEXT: atom.add.u64 %rd3, [%rd1], %rd2;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -44,8 +44,8 @@ define i32 @atom2(ptr %subr, i32 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom2_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [atom2_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom2_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [atom2_param_1];
|
||||
; CHECK-NEXT: neg.s32 %r2, %r1;
|
||||
; CHECK-NEXT: atom.add.u32 %r3, [%rd1], %r2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
@@ -61,8 +61,8 @@ define i64 @atom3(ptr %subr, i64 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom3_param_0];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [atom3_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom3_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [atom3_param_1];
|
||||
; CHECK-NEXT: neg.s64 %rd3, %rd2;
|
||||
; CHECK-NEXT: atom.add.u64 %rd4, [%rd1], %rd3;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd4;
|
||||
@@ -79,8 +79,8 @@ define i32 @atom4(ptr %subr, i32 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom4_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [atom4_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom4_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [atom4_param_1];
|
||||
; CHECK-NEXT: atom.and.b32 %r2, [%rd1], %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -95,8 +95,8 @@ define i64 @atom5(ptr %subr, i64 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom5_param_0];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [atom5_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom5_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [atom5_param_1];
|
||||
; CHECK-NEXT: atom.and.b64 %rd3, [%rd1], %rd2;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -123,8 +123,8 @@ define i32 @atom8(ptr %subr, i32 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom8_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [atom8_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom8_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [atom8_param_1];
|
||||
; CHECK-NEXT: atom.or.b32 %r2, [%rd1], %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -139,8 +139,8 @@ define i64 @atom9(ptr %subr, i64 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom9_param_0];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [atom9_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom9_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [atom9_param_1];
|
||||
; CHECK-NEXT: atom.or.b64 %rd3, [%rd1], %rd2;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -156,8 +156,8 @@ define i32 @atom10(ptr %subr, i32 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom10_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [atom10_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom10_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [atom10_param_1];
|
||||
; CHECK-NEXT: atom.xor.b32 %r2, [%rd1], %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -172,8 +172,8 @@ define i64 @atom11(ptr %subr, i64 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom11_param_0];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [atom11_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom11_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [atom11_param_1];
|
||||
; CHECK-NEXT: atom.xor.b64 %rd3, [%rd1], %rd2;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -189,8 +189,8 @@ define i32 @atom12(ptr %subr, i32 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom12_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [atom12_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom12_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [atom12_param_1];
|
||||
; CHECK-NEXT: atom.max.s32 %r2, [%rd1], %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -205,8 +205,8 @@ define i64 @atom13(ptr %subr, i64 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom13_param_0];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [atom13_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom13_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [atom13_param_1];
|
||||
; CHECK-NEXT: atom.max.s64 %rd3, [%rd1], %rd2;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -222,8 +222,8 @@ define i32 @atom14(ptr %subr, i32 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom14_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [atom14_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom14_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [atom14_param_1];
|
||||
; CHECK-NEXT: atom.min.s32 %r2, [%rd1], %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -238,8 +238,8 @@ define i64 @atom15(ptr %subr, i64 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom15_param_0];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [atom15_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom15_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [atom15_param_1];
|
||||
; CHECK-NEXT: atom.min.s64 %rd3, [%rd1], %rd2;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -255,8 +255,8 @@ define i32 @atom16(ptr %subr, i32 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom16_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [atom16_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom16_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [atom16_param_1];
|
||||
; CHECK-NEXT: atom.max.u32 %r2, [%rd1], %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -271,8 +271,8 @@ define i64 @atom17(ptr %subr, i64 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom17_param_0];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [atom17_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom17_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [atom17_param_1];
|
||||
; CHECK-NEXT: atom.max.u64 %rd3, [%rd1], %rd2;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -288,8 +288,8 @@ define i32 @atom18(ptr %subr, i32 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom18_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [atom18_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom18_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [atom18_param_1];
|
||||
; CHECK-NEXT: atom.min.u32 %r2, [%rd1], %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -304,8 +304,8 @@ define i64 @atom19(ptr %subr, i64 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom19_param_0];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [atom19_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom19_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [atom19_param_1];
|
||||
; CHECK-NEXT: atom.min.u64 %rd3, [%rd1], %rd2;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -320,8 +320,8 @@ define i32 @atom20(ptr %subr, i32 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom20_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [atom20_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom20_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [atom20_param_1];
|
||||
; CHECK-NEXT: atom.inc.u32 %r2, [%rd1], %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -336,8 +336,8 @@ define i32 @atom21(ptr %subr, i32 %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atom21_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [atom21_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atom21_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [atom21_param_1];
|
||||
; CHECK-NEXT: atom.dec.u32 %r2, [%rd1], %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -355,10 +355,10 @@ define float @atomic_add_f32_generic(ptr %addr, float %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atomic_add_f32_generic_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [atomic_add_f32_generic_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atomic_add_f32_generic_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [atomic_add_f32_generic_param_1];
|
||||
; CHECK-NEXT: atom.add.f32 %f2, [%rd1], %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
%ret = call float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val)
|
||||
ret float %ret
|
||||
@@ -374,10 +374,10 @@ define float @atomic_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atomic_add_f32_addrspace1_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [atomic_add_f32_addrspace1_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atomic_add_f32_addrspace1_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [atomic_add_f32_addrspace1_param_1];
|
||||
; CHECK-NEXT: atom.global.add.f32 %f2, [%rd1], %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
%ret = call float @llvm.nvvm.atomic.load.add.f32.p1(ptr addrspace(1) %addr, float %val)
|
||||
ret float %ret
|
||||
@@ -393,10 +393,10 @@ define float @atomic_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atomic_add_f32_addrspace3_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [atomic_add_f32_addrspace3_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atomic_add_f32_addrspace3_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [atomic_add_f32_addrspace3_param_1];
|
||||
; CHECK-NEXT: atom.shared.add.f32 %f2, [%rd1], %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
%ret = call float @llvm.nvvm.atomic.load.add.f32.p3(ptr addrspace(3) %addr, float %val)
|
||||
ret float %ret
|
||||
@@ -410,10 +410,10 @@ define float @atomicrmw_add_f32_generic(ptr %addr, float %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atomicrmw_add_f32_generic_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [atomicrmw_add_f32_generic_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atomicrmw_add_f32_generic_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [atomicrmw_add_f32_generic_param_1];
|
||||
; CHECK-NEXT: atom.add.f32 %f2, [%rd1], %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
%ret = atomicrmw fadd ptr %addr, float %val seq_cst
|
||||
ret float %ret
|
||||
@@ -431,7 +431,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [atomicrmw_add_f16_generic_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [atomicrmw_add_f16_generic_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [atomicrmw_add_f16_generic_param_0];
|
||||
; CHECK-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; CHECK-NEXT: cvt.u32.u64 %r6, %rd2;
|
||||
; CHECK-NEXT: and.b32 %r7, %r6, 3;
|
||||
@@ -439,7 +439,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) {
|
||||
; CHECK-NEXT: mov.b32 %r8, 65535;
|
||||
; CHECK-NEXT: shl.b32 %r9, %r8, %r1;
|
||||
; CHECK-NEXT: not.b32 %r2, %r9;
|
||||
; CHECK-NEXT: ld.u32 %r16, [%rd1];
|
||||
; CHECK-NEXT: ld.b32 %r16, [%rd1];
|
||||
; CHECK-NEXT: cvt.f32.f16 %f2, %rs1;
|
||||
; CHECK-NEXT: $L__BB24_1: // %atomicrmw.start
|
||||
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -474,10 +474,10 @@ define float @atomicrmw_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atomicrmw_add_f32_addrspace1_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [atomicrmw_add_f32_addrspace1_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atomicrmw_add_f32_addrspace1_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [atomicrmw_add_f32_addrspace1_param_1];
|
||||
; CHECK-NEXT: atom.global.add.f32 %f2, [%rd1], %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
%ret = atomicrmw fadd ptr addrspace(1) %addr, float %val seq_cst
|
||||
ret float %ret
|
||||
@@ -491,10 +491,10 @@ define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atomicrmw_add_f32_addrspace3_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [atomicrmw_add_f32_addrspace3_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atomicrmw_add_f32_addrspace3_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [atomicrmw_add_f32_addrspace3_param_1];
|
||||
; CHECK-NEXT: atom.shared.add.f32 %f2, [%rd1], %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
%ret = atomicrmw fadd ptr addrspace(3) %addr, float %val seq_cst
|
||||
ret float %ret
|
||||
@@ -508,10 +508,10 @@ define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atomic_cmpxchg_i32_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atomic_cmpxchg_i32_param_0];
|
||||
; CHECK-NEXT: membar.sys;
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [atomic_cmpxchg_i32_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [atomic_cmpxchg_i32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [atomic_cmpxchg_i32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [atomic_cmpxchg_i32_param_2];
|
||||
; CHECK-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -526,10 +526,10 @@ define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [atomic_cmpxchg_i64_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [atomic_cmpxchg_i64_param_0];
|
||||
; CHECK-NEXT: membar.sys;
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [atomic_cmpxchg_i64_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd3, [atomic_cmpxchg_i64_param_2];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [atomic_cmpxchg_i64_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd3, [atomic_cmpxchg_i64_param_2];
|
||||
; CHECK-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; CHECK-NEXT: ret;
|
||||
|
||||
@@ -7,8 +7,8 @@ declare void @llvm.nvvm.barrier.sync.cnt(i32, i32)
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}barrier_sync
|
||||
define void @barrier_sync(i32 %id, i32 %cnt) {
|
||||
; CHECK: ld.param.u32 [[ID:%r[0-9]+]], [barrier_sync_param_0];
|
||||
; CHECK: ld.param.u32 [[CNT:%r[0-9]+]], [barrier_sync_param_1];
|
||||
; CHECK: ld.param.b32 [[ID:%r[0-9]+]], [barrier_sync_param_0];
|
||||
; CHECK: ld.param.b32 [[CNT:%r[0-9]+]], [barrier_sync_param_1];
|
||||
|
||||
; CHECK: barrier.sync [[ID]], [[CNT]];
|
||||
call void @llvm.nvvm.barrier.sync.cnt(i32 %id, i32 %cnt)
|
||||
|
||||
@@ -22,10 +22,10 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) {
|
||||
; SM70-NEXT: .reg .b32 %f<4>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u16 %r1, [test_fadd_param_1];
|
||||
; SM70-NEXT: ld.param.b16 %r1, [test_fadd_param_1];
|
||||
; SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; SM70-NEXT: ld.param.u16 %r3, [test_fadd_param_0];
|
||||
; SM70-NEXT: ld.param.b16 %r3, [test_fadd_param_0];
|
||||
; SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; SM70-NEXT: add.rn.f32 %f3, %f2, %f1;
|
||||
@@ -90,10 +90,10 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) {
|
||||
; SM70-NEXT: .reg .b32 %f<4>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u16 %r1, [test_fsub_param_1];
|
||||
; SM70-NEXT: ld.param.b16 %r1, [test_fsub_param_1];
|
||||
; SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; SM70-NEXT: ld.param.u16 %r3, [test_fsub_param_0];
|
||||
; SM70-NEXT: ld.param.b16 %r3, [test_fsub_param_0];
|
||||
; SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; SM70-NEXT: sub.rn.f32 %f3, %f2, %f1;
|
||||
@@ -569,10 +569,10 @@ define float @test_fpext_float(bfloat %a) #0 {
|
||||
; SM70-NEXT: .reg .b32 %f<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u16 %r1, [test_fpext_float_param_0];
|
||||
; SM70-NEXT: ld.param.b16 %r1, [test_fpext_float_param_0];
|
||||
; SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; SM70-NEXT: st.param.f32 [func_retval0], %f1;
|
||||
; SM70-NEXT: st.param.b32 [func_retval0], %f1;
|
||||
; SM70-NEXT: ret;
|
||||
;
|
||||
; SM80-LABEL: test_fpext_float(
|
||||
@@ -583,7 +583,7 @@ define float @test_fpext_float(bfloat %a) #0 {
|
||||
; SM80-NEXT: // %bb.0:
|
||||
; SM80-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0];
|
||||
; SM80-NEXT: cvt.f32.bf16 %f1, %rs1;
|
||||
; SM80-NEXT: st.param.f32 [func_retval0], %f1;
|
||||
; SM80-NEXT: st.param.b32 [func_retval0], %f1;
|
||||
; SM80-NEXT: ret;
|
||||
;
|
||||
; SM80-FTZ-LABEL: test_fpext_float(
|
||||
@@ -594,7 +594,7 @@ define float @test_fpext_float(bfloat %a) #0 {
|
||||
; SM80-FTZ-NEXT: // %bb.0:
|
||||
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0];
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1;
|
||||
; SM80-FTZ-NEXT: st.param.f32 [func_retval0], %f1;
|
||||
; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %f1;
|
||||
; SM80-FTZ-NEXT: ret;
|
||||
;
|
||||
; SM90-LABEL: test_fpext_float(
|
||||
@@ -605,7 +605,7 @@ define float @test_fpext_float(bfloat %a) #0 {
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0];
|
||||
; SM90-NEXT: cvt.f32.bf16 %f1, %rs1;
|
||||
; SM90-NEXT: st.param.f32 [func_retval0], %f1;
|
||||
; SM90-NEXT: st.param.b32 [func_retval0], %f1;
|
||||
; SM90-NEXT: ret;
|
||||
%r = fpext bfloat %a to float
|
||||
ret float %r
|
||||
@@ -620,7 +620,7 @@ define bfloat @test_fptrunc_float(float %a) #0 {
|
||||
; SM70-NEXT: .reg .b32 %f<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0];
|
||||
; SM70-NEXT: ld.param.b32 %f1, [test_fptrunc_float_param_0];
|
||||
; SM70-NEXT: mov.b32 %r1, %f1;
|
||||
; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
|
||||
; SM70-NEXT: add.s32 %r3, %r2, %r1;
|
||||
@@ -638,7 +638,7 @@ define bfloat @test_fptrunc_float(float %a) #0 {
|
||||
; SM80-NEXT: .reg .b32 %f<2>;
|
||||
; SM80-EMPTY:
|
||||
; SM80-NEXT: // %bb.0:
|
||||
; SM80-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0];
|
||||
; SM80-NEXT: ld.param.b32 %f1, [test_fptrunc_float_param_0];
|
||||
; SM80-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
|
||||
; SM80-NEXT: st.param.b16 [func_retval0], %rs1;
|
||||
; SM80-NEXT: ret;
|
||||
@@ -649,7 +649,7 @@ define bfloat @test_fptrunc_float(float %a) #0 {
|
||||
; SM80-FTZ-NEXT: .reg .b32 %f<2>;
|
||||
; SM80-FTZ-EMPTY:
|
||||
; SM80-FTZ-NEXT: // %bb.0:
|
||||
; SM80-FTZ-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0];
|
||||
; SM80-FTZ-NEXT: ld.param.b32 %f1, [test_fptrunc_float_param_0];
|
||||
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
|
||||
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
|
||||
; SM80-FTZ-NEXT: ret;
|
||||
@@ -660,7 +660,7 @@ define bfloat @test_fptrunc_float(float %a) #0 {
|
||||
; SM90-NEXT: .reg .b32 %f<2>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0];
|
||||
; SM90-NEXT: ld.param.b32 %f1, [test_fptrunc_float_param_0];
|
||||
; SM90-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
|
||||
; SM90-NEXT: st.param.b16 [func_retval0], %rs1;
|
||||
; SM90-NEXT: ret;
|
||||
@@ -677,7 +677,7 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 {
|
||||
; SM70-NEXT: .reg .b32 %f<3>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u16 %r1, [test_fadd_imm_1_param_0];
|
||||
; SM70-NEXT: ld.param.b16 %r1, [test_fadd_imm_1_param_0];
|
||||
; SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000;
|
||||
@@ -738,8 +738,8 @@ define bfloat @test_select_cc_bf16_f64(double %a, double %b, bfloat %c, bfloat %
|
||||
; CHECK-NEXT: .reg .b64 %fd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f64 %fd1, [test_select_cc_bf16_f64_param_0];
|
||||
; CHECK-NEXT: ld.param.f64 %fd2, [test_select_cc_bf16_f64_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %fd1, [test_select_cc_bf16_f64_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %fd2, [test_select_cc_bf16_f64_param_1];
|
||||
; CHECK-NEXT: setp.lt.f64 %p1, %fd1, %fd2;
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [test_select_cc_bf16_f64_param_2];
|
||||
; CHECK-NEXT: ld.param.b16 %rs2, [test_select_cc_bf16_f64_param_3];
|
||||
@@ -760,7 +760,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
|
||||
; SM70-NEXT: .reg .b64 %rd<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
|
||||
; SM70-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
|
||||
; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2;
|
||||
@@ -790,8 +790,8 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
|
||||
; SM70-NEXT: cvt.u32.u16 %r19, %rs1;
|
||||
; SM70-NEXT: shl.b32 %r20, %r19, 16;
|
||||
; SM70-NEXT: mov.b32 %f8, %r20;
|
||||
; SM70-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
|
||||
; SM70-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
|
||||
; SM70-NEXT: st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5};
|
||||
; SM70-NEXT: st.param.v4.b32 [func_retval0+16], {%f4, %f3, %f2, %f1};
|
||||
; SM70-NEXT: ret;
|
||||
;
|
||||
; SM80-LABEL: test_extload_bf16x8(
|
||||
@@ -802,7 +802,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
|
||||
; SM80-NEXT: .reg .b64 %rd<2>;
|
||||
; SM80-EMPTY:
|
||||
; SM80-NEXT: // %bb.0:
|
||||
; SM80-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
|
||||
; SM80-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
|
||||
; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2;
|
||||
@@ -816,8 +816,8 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
|
||||
; SM80-NEXT: cvt.f32.bf16 %f6, %rs3;
|
||||
; SM80-NEXT: cvt.f32.bf16 %f7, %rs2;
|
||||
; SM80-NEXT: cvt.f32.bf16 %f8, %rs1;
|
||||
; SM80-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
|
||||
; SM80-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
|
||||
; SM80-NEXT: st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5};
|
||||
; SM80-NEXT: st.param.v4.b32 [func_retval0+16], {%f4, %f3, %f2, %f1};
|
||||
; SM80-NEXT: ret;
|
||||
;
|
||||
; SM80-FTZ-LABEL: test_extload_bf16x8(
|
||||
@@ -828,7 +828,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
|
||||
; SM80-FTZ-NEXT: .reg .b64 %rd<2>;
|
||||
; SM80-FTZ-EMPTY:
|
||||
; SM80-FTZ-NEXT: // %bb.0:
|
||||
; SM80-FTZ-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
|
||||
; SM80-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
|
||||
; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r2;
|
||||
@@ -842,8 +842,8 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6, %rs3;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs2;
|
||||
; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs1;
|
||||
; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
|
||||
; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
|
||||
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5};
|
||||
; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0+16], {%f4, %f3, %f2, %f1};
|
||||
; SM80-FTZ-NEXT: ret;
|
||||
;
|
||||
; SM90-LABEL: test_extload_bf16x8(
|
||||
@@ -854,7 +854,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
|
||||
; SM90-NEXT: .reg .b64 %rd<2>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0];
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0];
|
||||
; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r2;
|
||||
@@ -868,8 +868,8 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 {
|
||||
; SM90-NEXT: cvt.f32.bf16 %f6, %rs3;
|
||||
; SM90-NEXT: cvt.f32.bf16 %f7, %rs2;
|
||||
; SM90-NEXT: cvt.f32.bf16 %f8, %rs1;
|
||||
; SM90-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5};
|
||||
; SM90-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1};
|
||||
; SM90-NEXT: st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5};
|
||||
; SM90-NEXT: st.param.v4.b32 [func_retval0+16], {%f4, %f3, %f2, %f1};
|
||||
; SM90-NEXT: ret;
|
||||
%load = load <8 x bfloat>, ptr addrspace(3) %arg, align 16
|
||||
%res = fpext <8 x bfloat> %load to <8 x float>
|
||||
@@ -884,7 +884,7 @@ define i16 @test_fptosi_i16(bfloat %a) {
|
||||
; SM70-NEXT: .reg .b32 %f<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u16 %r1, [test_fptosi_i16_param_0];
|
||||
; SM70-NEXT: ld.param.b16 %r1, [test_fptosi_i16_param_0];
|
||||
; SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; SM70-NEXT: cvt.rzi.s16.f32 %rs1, %f1;
|
||||
@@ -943,7 +943,7 @@ define i16 @test_fptoui_i16(bfloat %a) {
|
||||
; SM70-NEXT: .reg .b32 %f<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u16 %r1, [test_fptoui_i16_param_0];
|
||||
; SM70-NEXT: ld.param.b16 %r1, [test_fptoui_i16_param_0];
|
||||
; SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; SM70-NEXT: cvt.rzi.u16.f32 %rs1, %f1;
|
||||
@@ -1003,7 +1003,7 @@ define bfloat @test_sitofp_i16(i16 %a) {
|
||||
; SM70-NEXT: .reg .b32 %f<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0];
|
||||
; SM70-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0];
|
||||
; SM70-NEXT: cvt.rn.f32.s16 %f1, %rs1;
|
||||
; SM70-NEXT: mov.b32 %r1, %f1;
|
||||
; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
|
||||
@@ -1022,7 +1022,7 @@ define bfloat @test_sitofp_i16(i16 %a) {
|
||||
; SM80-NEXT: .reg .b32 %f<2>;
|
||||
; SM80-EMPTY:
|
||||
; SM80-NEXT: // %bb.0:
|
||||
; SM80-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0];
|
||||
; SM80-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0];
|
||||
; SM80-NEXT: cvt.rn.f32.s16 %f1, %rs1;
|
||||
; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
|
||||
; SM80-NEXT: st.param.b16 [func_retval0], %rs2;
|
||||
@@ -1034,7 +1034,7 @@ define bfloat @test_sitofp_i16(i16 %a) {
|
||||
; SM80-FTZ-NEXT: .reg .b32 %f<2>;
|
||||
; SM80-FTZ-EMPTY:
|
||||
; SM80-FTZ-NEXT: // %bb.0:
|
||||
; SM80-FTZ-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0];
|
||||
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0];
|
||||
; SM80-FTZ-NEXT: cvt.rn.f32.s16 %f1, %rs1;
|
||||
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
|
||||
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
|
||||
@@ -1045,7 +1045,7 @@ define bfloat @test_sitofp_i16(i16 %a) {
|
||||
; SM90-NEXT: .reg .b16 %rs<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0];
|
||||
; SM90-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0];
|
||||
; SM90-NEXT: cvt.rn.bf16.s16 %rs2, %rs1;
|
||||
; SM90-NEXT: st.param.b16 [func_retval0], %rs2;
|
||||
; SM90-NEXT: ret;
|
||||
@@ -1062,7 +1062,7 @@ define bfloat @test_uitofp_i8(i8 %a) {
|
||||
; SM70-NEXT: .reg .b32 %f<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0];
|
||||
; SM70-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0];
|
||||
; SM70-NEXT: cvt.rn.f32.u16 %f1, %rs1;
|
||||
; SM70-NEXT: mov.b32 %r1, %f1;
|
||||
; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
|
||||
@@ -1081,7 +1081,7 @@ define bfloat @test_uitofp_i8(i8 %a) {
|
||||
; SM80-NEXT: .reg .b32 %f<2>;
|
||||
; SM80-EMPTY:
|
||||
; SM80-NEXT: // %bb.0:
|
||||
; SM80-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0];
|
||||
; SM80-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0];
|
||||
; SM80-NEXT: cvt.rn.f32.u16 %f1, %rs1;
|
||||
; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
|
||||
; SM80-NEXT: st.param.b16 [func_retval0], %rs2;
|
||||
@@ -1093,7 +1093,7 @@ define bfloat @test_uitofp_i8(i8 %a) {
|
||||
; SM80-FTZ-NEXT: .reg .b32 %f<2>;
|
||||
; SM80-FTZ-EMPTY:
|
||||
; SM80-FTZ-NEXT: // %bb.0:
|
||||
; SM80-FTZ-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0];
|
||||
; SM80-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0];
|
||||
; SM80-FTZ-NEXT: cvt.rn.f32.u16 %f1, %rs1;
|
||||
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
|
||||
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
|
||||
@@ -1104,7 +1104,7 @@ define bfloat @test_uitofp_i8(i8 %a) {
|
||||
; SM90-NEXT: .reg .b16 %rs<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0];
|
||||
; SM90-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0];
|
||||
; SM90-NEXT: cvt.rn.bf16.u16 %rs2, %rs1;
|
||||
; SM90-NEXT: st.param.b16 [func_retval0], %rs2;
|
||||
; SM90-NEXT: ret;
|
||||
@@ -1121,7 +1121,7 @@ define bfloat @test_uitofp_i1(i1 %a) {
|
||||
; SM70-NEXT: .reg .b32 %f<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0];
|
||||
; SM70-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0];
|
||||
; SM70-NEXT: and.b16 %rs2, %rs1, 1;
|
||||
; SM70-NEXT: setp.ne.b16 %p1, %rs2, 0;
|
||||
; SM70-NEXT: selp.b32 %r1, 1, 0, %p1;
|
||||
@@ -1145,7 +1145,7 @@ define bfloat @test_uitofp_i1(i1 %a) {
|
||||
; SM80-NEXT: .reg .b32 %f<2>;
|
||||
; SM80-EMPTY:
|
||||
; SM80-NEXT: // %bb.0:
|
||||
; SM80-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0];
|
||||
; SM80-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0];
|
||||
; SM80-NEXT: and.b16 %rs2, %rs1, 1;
|
||||
; SM80-NEXT: setp.ne.b16 %p1, %rs2, 0;
|
||||
; SM80-NEXT: selp.b32 %r1, 1, 0, %p1;
|
||||
@@ -1162,7 +1162,7 @@ define bfloat @test_uitofp_i1(i1 %a) {
|
||||
; SM80-FTZ-NEXT: .reg .b32 %f<2>;
|
||||
; SM80-FTZ-EMPTY:
|
||||
; SM80-FTZ-NEXT: // %bb.0:
|
||||
; SM80-FTZ-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0];
|
||||
; SM80-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0];
|
||||
; SM80-FTZ-NEXT: and.b16 %rs2, %rs1, 1;
|
||||
; SM80-FTZ-NEXT: setp.ne.b16 %p1, %rs2, 0;
|
||||
; SM80-FTZ-NEXT: selp.b32 %r1, 1, 0, %p1;
|
||||
@@ -1178,7 +1178,7 @@ define bfloat @test_uitofp_i1(i1 %a) {
|
||||
; SM90-NEXT: .reg .b32 %r<2>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0];
|
||||
; SM90-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0];
|
||||
; SM90-NEXT: and.b16 %rs2, %rs1, 1;
|
||||
; SM90-NEXT: setp.ne.b16 %p1, %rs2, 0;
|
||||
; SM90-NEXT: selp.b32 %r1, 1, 0, %p1;
|
||||
@@ -1198,7 +1198,7 @@ define bfloat @test_uitofp_i16(i16 %a) {
|
||||
; SM70-NEXT: .reg .b32 %f<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0];
|
||||
; SM70-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0];
|
||||
; SM70-NEXT: cvt.rn.f32.u16 %f1, %rs1;
|
||||
; SM70-NEXT: mov.b32 %r1, %f1;
|
||||
; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
|
||||
@@ -1217,7 +1217,7 @@ define bfloat @test_uitofp_i16(i16 %a) {
|
||||
; SM80-NEXT: .reg .b32 %f<2>;
|
||||
; SM80-EMPTY:
|
||||
; SM80-NEXT: // %bb.0:
|
||||
; SM80-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0];
|
||||
; SM80-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0];
|
||||
; SM80-NEXT: cvt.rn.f32.u16 %f1, %rs1;
|
||||
; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
|
||||
; SM80-NEXT: st.param.b16 [func_retval0], %rs2;
|
||||
@@ -1229,7 +1229,7 @@ define bfloat @test_uitofp_i16(i16 %a) {
|
||||
; SM80-FTZ-NEXT: .reg .b32 %f<2>;
|
||||
; SM80-FTZ-EMPTY:
|
||||
; SM80-FTZ-NEXT: // %bb.0:
|
||||
; SM80-FTZ-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0];
|
||||
; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0];
|
||||
; SM80-FTZ-NEXT: cvt.rn.f32.u16 %f1, %rs1;
|
||||
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f1;
|
||||
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2;
|
||||
@@ -1240,7 +1240,7 @@ define bfloat @test_uitofp_i16(i16 %a) {
|
||||
; SM90-NEXT: .reg .b16 %rs<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0];
|
||||
; SM90-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0];
|
||||
; SM90-NEXT: cvt.rn.bf16.u16 %rs2, %rs1;
|
||||
; SM90-NEXT: st.param.b16 [func_retval0], %rs2;
|
||||
; SM90-NEXT: ret;
|
||||
@@ -1257,7 +1257,7 @@ define bfloat @test_uitofp_i32(i32 %a) {
|
||||
; SM70-NEXT: .reg .b32 %f<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0];
|
||||
; SM70-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0];
|
||||
; SM70-NEXT: cvt.rn.f32.u32 %f1, %r1;
|
||||
; SM70-NEXT: mov.b32 %r2, %f1;
|
||||
; SM70-NEXT: bfe.u32 %r3, %r2, 16, 1;
|
||||
@@ -1277,7 +1277,7 @@ define bfloat @test_uitofp_i32(i32 %a) {
|
||||
; SM80-NEXT: .reg .b32 %f<2>;
|
||||
; SM80-EMPTY:
|
||||
; SM80-NEXT: // %bb.0:
|
||||
; SM80-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0];
|
||||
; SM80-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0];
|
||||
; SM80-NEXT: cvt.rn.f32.u32 %f1, %r1;
|
||||
; SM80-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
|
||||
; SM80-NEXT: st.param.b16 [func_retval0], %rs1;
|
||||
@@ -1290,7 +1290,7 @@ define bfloat @test_uitofp_i32(i32 %a) {
|
||||
; SM80-FTZ-NEXT: .reg .b32 %f<2>;
|
||||
; SM80-FTZ-EMPTY:
|
||||
; SM80-FTZ-NEXT: // %bb.0:
|
||||
; SM80-FTZ-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0];
|
||||
; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0];
|
||||
; SM80-FTZ-NEXT: cvt.rn.f32.u32 %f1, %r1;
|
||||
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
|
||||
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
|
||||
@@ -1302,7 +1302,7 @@ define bfloat @test_uitofp_i32(i32 %a) {
|
||||
; SM90-NEXT: .reg .b32 %r<2>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0];
|
||||
; SM90-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0];
|
||||
; SM90-NEXT: cvt.rn.bf16.u32 %rs1, %r1;
|
||||
; SM90-NEXT: st.param.b16 [func_retval0], %rs1;
|
||||
; SM90-NEXT: ret;
|
||||
@@ -1320,7 +1320,7 @@ define bfloat @test_uitofp_i64(i64 %a) {
|
||||
; SM70-NEXT: .reg .b64 %rd<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0];
|
||||
; SM70-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0];
|
||||
; SM70-NEXT: cvt.rn.f32.u64 %f1, %rd1;
|
||||
; SM70-NEXT: mov.b32 %r1, %f1;
|
||||
; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1;
|
||||
@@ -1340,7 +1340,7 @@ define bfloat @test_uitofp_i64(i64 %a) {
|
||||
; SM80-NEXT: .reg .b64 %rd<2>;
|
||||
; SM80-EMPTY:
|
||||
; SM80-NEXT: // %bb.0:
|
||||
; SM80-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0];
|
||||
; SM80-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0];
|
||||
; SM80-NEXT: cvt.rn.f32.u64 %f1, %rd1;
|
||||
; SM80-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
|
||||
; SM80-NEXT: st.param.b16 [func_retval0], %rs1;
|
||||
@@ -1353,7 +1353,7 @@ define bfloat @test_uitofp_i64(i64 %a) {
|
||||
; SM80-FTZ-NEXT: .reg .b64 %rd<2>;
|
||||
; SM80-FTZ-EMPTY:
|
||||
; SM80-FTZ-NEXT: // %bb.0:
|
||||
; SM80-FTZ-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0];
|
||||
; SM80-FTZ-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0];
|
||||
; SM80-FTZ-NEXT: cvt.rn.f32.u64 %f1, %rd1;
|
||||
; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
|
||||
; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1;
|
||||
@@ -1365,7 +1365,7 @@ define bfloat @test_uitofp_i64(i64 %a) {
|
||||
; SM90-NEXT: .reg .b64 %rd<2>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0];
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0];
|
||||
; SM90-NEXT: cvt.rn.bf16.u64 %rs1, %rd1;
|
||||
; SM90-NEXT: st.param.b16 [func_retval0], %rs1;
|
||||
; SM90-NEXT: ret;
|
||||
@@ -1382,7 +1382,7 @@ define bfloat @test_roundeven(bfloat %a) {
|
||||
; SM70-NEXT: .reg .b32 %f<3>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u16 %r1, [test_roundeven_param_0];
|
||||
; SM70-NEXT: ld.param.b16 %r1, [test_roundeven_param_0];
|
||||
; SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; SM70-NEXT: cvt.rni.f32.f32 %f2, %f1;
|
||||
@@ -1514,10 +1514,10 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) {
|
||||
; SM70-NEXT: .reg .b32 %f<4>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u16 %r1, [test_maxnum_param_1];
|
||||
; SM70-NEXT: ld.param.b16 %r1, [test_maxnum_param_1];
|
||||
; SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; SM70-NEXT: ld.param.u16 %r3, [test_maxnum_param_0];
|
||||
; SM70-NEXT: ld.param.b16 %r3, [test_maxnum_param_0];
|
||||
; SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; SM70-NEXT: max.f32 %f3, %f2, %f1;
|
||||
|
||||
@@ -26,8 +26,8 @@ define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %ou
|
||||
|
||||
define void @test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) {
|
||||
; CHECK-LABEL: @test_bitcast_to_bfloat
|
||||
; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}]
|
||||
; CHECK: st.global.u16 [{{%rd[0-9]+}}], [[TMP]]
|
||||
; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}]
|
||||
; CHECK: st.global.b16 [{{%rd[0-9]+}}], [[TMP]]
|
||||
%val = load i16, ptr addrspace(1) %in
|
||||
%val_fp = bitcast i16 %val to bfloat
|
||||
store bfloat %val_fp, ptr addrspace(1) %out
|
||||
|
||||
@@ -157,7 +157,7 @@ define <2 x bfloat> @test_fneg(<2 x bfloat> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_fneg_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_fneg_param_0];
|
||||
; CHECK-NEXT: xor.b32 %r2, %r1, -2147450880;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -172,9 +172,9 @@ define void @test_ldst_v2bf16(ptr %a, ptr %b) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2bf16_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v2bf16_param_0];
|
||||
; CHECK-NEXT: ld.b32 %r1, [%rd1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2bf16_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v2bf16_param_1];
|
||||
; CHECK-NEXT: st.b32 [%rd2], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%t1 = load <2 x bfloat>, ptr %a
|
||||
@@ -190,11 +190,11 @@ define void @test_ldst_v3bf16(ptr %a, ptr %b) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3bf16_param_0];
|
||||
; CHECK-NEXT: ld.u64 %rd2, [%rd1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v3bf16_param_0];
|
||||
; CHECK-NEXT: ld.b64 %rd2, [%rd1];
|
||||
; CHECK-NEXT: mov.b64 {_, %r1}, %rd2;
|
||||
; CHECK-NEXT: ld.param.u64 %rd3, [test_ldst_v3bf16_param_1];
|
||||
; CHECK-NEXT: st.u32 [%rd3], %rd2;
|
||||
; CHECK-NEXT: ld.param.b64 %rd3, [test_ldst_v3bf16_param_1];
|
||||
; CHECK-NEXT: st.b32 [%rd3], %rd2;
|
||||
; CHECK-NEXT: mov.b32 {%rs1, _}, %r1;
|
||||
; CHECK-NEXT: st.b16 [%rd3+4], %rs1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -241,7 +241,7 @@ define <2 x bfloat> @test_select(<2 x bfloat> %a, <2 x bfloat> %b, i1 zeroext %c
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u8 %rs1, [test_select_param_2];
|
||||
; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2];
|
||||
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
|
||||
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_select_param_1];
|
||||
@@ -315,7 +315,7 @@ define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b,
|
||||
; SM80-NEXT: .reg .b32 %f<11>;
|
||||
; SM80-EMPTY:
|
||||
; SM80-NEXT: // %bb.0:
|
||||
; SM80-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
|
||||
; SM80-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
|
||||
; SM80-NEXT: ld.param.b32 %r1, [test_select_cc_f32_bf16_param_2];
|
||||
; SM80-NEXT: ld.param.b32 %r2, [test_select_cc_f32_bf16_param_3];
|
||||
; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
@@ -326,10 +326,10 @@ define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b,
|
||||
; SM80-NEXT: cvt.f32.bf16 %f5, %rs2;
|
||||
; SM80-NEXT: cvt.f32.bf16 %f6, %rs4;
|
||||
; SM80-NEXT: setp.neu.f32 %p2, %f6, %f5;
|
||||
; SM80-NEXT: ld.param.v2.f32 {%f7, %f8}, [test_select_cc_f32_bf16_param_1];
|
||||
; SM80-NEXT: ld.param.v2.b32 {%f7, %f8}, [test_select_cc_f32_bf16_param_1];
|
||||
; SM80-NEXT: selp.f32 %f9, %f2, %f8, %p2;
|
||||
; SM80-NEXT: selp.f32 %f10, %f1, %f7, %p1;
|
||||
; SM80-NEXT: st.param.v2.f32 [func_retval0], {%f10, %f9};
|
||||
; SM80-NEXT: st.param.v2.b32 [func_retval0], {%f10, %f9};
|
||||
; SM80-NEXT: ret;
|
||||
;
|
||||
; SM90-LABEL: test_select_cc_f32_bf16(
|
||||
@@ -339,14 +339,14 @@ define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b,
|
||||
; SM90-NEXT: .reg .b32 %f<7>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
|
||||
; SM90-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0];
|
||||
; SM90-NEXT: ld.param.b32 %r1, [test_select_cc_f32_bf16_param_3];
|
||||
; SM90-NEXT: ld.param.b32 %r2, [test_select_cc_f32_bf16_param_2];
|
||||
; SM90-NEXT: setp.neu.bf16x2 %p1|%p2, %r2, %r1;
|
||||
; SM90-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_bf16_param_1];
|
||||
; SM90-NEXT: ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f32_bf16_param_1];
|
||||
; SM90-NEXT: selp.f32 %f5, %f2, %f4, %p2;
|
||||
; SM90-NEXT: selp.f32 %f6, %f1, %f3, %p1;
|
||||
; SM90-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
|
||||
; SM90-NEXT: st.param.v2.b32 [func_retval0], {%f6, %f5};
|
||||
; SM90-NEXT: ret;
|
||||
<2 x bfloat> %c, <2 x bfloat> %d) #0 {
|
||||
%cc = fcmp une <2 x bfloat> %c, %d
|
||||
@@ -365,8 +365,8 @@ define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b,
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_bf16_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_bf16_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_bf16_f32_param_2];
|
||||
; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_bf16_f32_param_3];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_select_cc_bf16_f32_param_2];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%f3, %f4}, [test_select_cc_bf16_f32_param_3];
|
||||
; CHECK-NEXT: setp.neu.f32 %p1, %f1, %f3;
|
||||
; CHECK-NEXT: setp.neu.f32 %p2, %f2, %f4;
|
||||
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
@@ -389,7 +389,7 @@ define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0];
|
||||
; CHECK-NEXT: cvt.rn.bf16x2.f32 %r1, %f2, %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -409,7 +409,7 @@ define <2 x float> @test_fpext_2xfloat(<2 x bfloat> %a) #0 {
|
||||
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; CHECK-NEXT: cvt.f32.bf16 %f1, %rs2;
|
||||
; CHECK-NEXT: cvt.f32.bf16 %f2, %rs1;
|
||||
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
|
||||
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%f2, %f1};
|
||||
; CHECK-NEXT: ret;
|
||||
%r = fpext <2 x bfloat> %a to <2 x float>
|
||||
ret <2 x float> %r
|
||||
@@ -421,7 +421,7 @@ define <2 x i16> @test_bitcast_2xbf16_to_2xi16(<2 x bfloat> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_2xbf16_to_2xi16_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_2xbf16_to_2xi16_param_0];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%r = bitcast <2 x bfloat> %a to <2 x i16>
|
||||
@@ -507,7 +507,7 @@ define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_fabs_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_fabs_param_0];
|
||||
; CHECK-NEXT: and.b32 %r2, %r1, 2147450879;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
|
||||
@@ -12,7 +12,7 @@ define i32 @bfe0(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [bfe0_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [bfe0_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r2, %r1, 4, 4;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -27,7 +27,7 @@ define i32 @bfe1(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [bfe1_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [bfe1_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r2, %r1, 3, 3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -42,7 +42,7 @@ define i32 @bfe2(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [bfe2_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [bfe2_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r2, %r1, 5, 3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -57,7 +57,7 @@ define i32 @no_bfe_on_32bit_overflow(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [no_bfe_on_32bit_overflow_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [no_bfe_on_32bit_overflow_param_0];
|
||||
; CHECK-NEXT: shr.s32 %r2, %r1, 31;
|
||||
; CHECK-NEXT: and.b32 %r3, %r2, 15;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
@@ -73,7 +73,7 @@ define i32 @no_bfe_on_32bit_overflow_shr_and_pair(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [no_bfe_on_32bit_overflow_shr_and_pair_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [no_bfe_on_32bit_overflow_shr_and_pair_param_0];
|
||||
; CHECK-NEXT: shr.s32 %r2, %r1, 31;
|
||||
; CHECK-NEXT: and.b32 %r3, %r2, 15;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
@@ -89,7 +89,7 @@ define i64 @no_bfe_on_64bit_overflow(i64 %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [no_bfe_on_64bit_overflow_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [no_bfe_on_64bit_overflow_param_0];
|
||||
; CHECK-NEXT: shr.s64 %rd2, %rd1, 63;
|
||||
; CHECK-NEXT: and.b64 %rd3, %rd2, 7;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
@@ -105,7 +105,7 @@ define i64 @no_bfe_on_64bit_overflow_shr_and_pair(i64 %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [no_bfe_on_64bit_overflow_shr_and_pair_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [no_bfe_on_64bit_overflow_shr_and_pair_param_0];
|
||||
; CHECK-NEXT: shr.s64 %rd2, %rd1, 63;
|
||||
; CHECK-NEXT: and.b64 %rd3, %rd2, 7;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
@@ -121,7 +121,7 @@ define i32 @bfe_ashr_signed_32(i32 %x) {
|
||||
; CHECK-O3-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-O3-EMPTY:
|
||||
; CHECK-O3-NEXT: // %bb.0:
|
||||
; CHECK-O3-NEXT: ld.param.u16 %r1, [bfe_ashr_signed_32_param_0+2];
|
||||
; CHECK-O3-NEXT: ld.param.b16 %r1, [bfe_ashr_signed_32_param_0+2];
|
||||
; CHECK-O3-NEXT: bfe.s32 %r2, %r1, 4, 12;
|
||||
; CHECK-O3-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-O3-NEXT: ret;
|
||||
@@ -131,7 +131,7 @@ define i32 @bfe_ashr_signed_32(i32 %x) {
|
||||
; CHECK-O0-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-O0-EMPTY:
|
||||
; CHECK-O0-NEXT: // %bb.0:
|
||||
; CHECK-O0-NEXT: ld.param.u32 %r1, [bfe_ashr_signed_32_param_0];
|
||||
; CHECK-O0-NEXT: ld.param.b32 %r1, [bfe_ashr_signed_32_param_0];
|
||||
; CHECK-O0-NEXT: bfe.s32 %r2, %r1, 20, 12;
|
||||
; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-O0-NEXT: ret;
|
||||
@@ -146,7 +146,7 @@ define i32 @bfe_ashr_unsigned_32(i32 %x) {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [bfe_ashr_unsigned_32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [bfe_ashr_unsigned_32_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r2, %r1, 5, 6;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -161,7 +161,7 @@ define i64 @bfe_ashr_signed_64(i64 %x) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [bfe_ashr_signed_64_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [bfe_ashr_signed_64_param_0];
|
||||
; CHECK-NEXT: bfe.s64 %rd2, %rd1, 16, 48;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -176,7 +176,7 @@ define i64 @bfe_ashr_unsigned_64(i64 %x) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [bfe_ashr_unsigned_64_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [bfe_ashr_unsigned_64_param_0];
|
||||
; CHECK-NEXT: bfe.u64 %rd2, %rd1, 5, 6;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -192,7 +192,7 @@ define i32 @bfe3(i128 %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [bfe3_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [bfe3_param_0];
|
||||
; CHECK-NEXT: cvt.u32.u64 %r1, %rd1;
|
||||
; CHECK-NEXT: bfe.s32 %r2, %r1, 15, 17;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
@@ -209,7 +209,7 @@ define i64 @bfe4(i128 %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [bfe4_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [bfe4_param_0];
|
||||
; CHECK-NEXT: bfe.s64 %rd3, %rd1, 17, 47;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; CHECK-NEXT: ret;
|
||||
|
||||
@@ -10,8 +10,8 @@ define i32 @bmsk_wrap(i32 %a, i32 %b) {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [bmsk_wrap_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [bmsk_wrap_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [bmsk_wrap_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [bmsk_wrap_param_1];
|
||||
; CHECK-NEXT: bmsk.wrap.b32 %r3, %r1, %r2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -25,8 +25,8 @@ define i32 @bmsk_clamp(i32 %a, i32 %b) {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [bmsk_clamp_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [bmsk_clamp_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [bmsk_clamp_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [bmsk_clamp_param_1];
|
||||
; CHECK-NEXT: bmsk.clamp.b32 %r3, %r1, %r2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -54,7 +54,7 @@ define i32 @bmsk_clamp_ir(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [bmsk_clamp_ir_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [bmsk_clamp_ir_param_0];
|
||||
; CHECK-NEXT: bmsk.clamp.b32 %r2, %r1, 7;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -68,7 +68,7 @@ define i32 @bmsk_wrap_ri(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [bmsk_wrap_ri_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [bmsk_wrap_ri_param_0];
|
||||
; CHECK-NEXT: bmsk.wrap.b32 %r2, 5, %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
|
||||
@@ -14,7 +14,7 @@ define i16 @bswap16(i16 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u16 %rs1, [bswap16_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [bswap16_param_0];
|
||||
; CHECK-NEXT: shr.u16 %rs2, %rs1, 8;
|
||||
; CHECK-NEXT: shl.b16 %rs3, %rs1, 8;
|
||||
; CHECK-NEXT: or.b16 %rs4, %rs3, %rs2;
|
||||
@@ -32,7 +32,7 @@ define i32 @bswap32(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [bswap32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [bswap32_param_0];
|
||||
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 291;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -47,7 +47,7 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [bswapv2i16_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [bswapv2i16_param_0];
|
||||
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 8961;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -62,7 +62,7 @@ define i64 @bswap64(i64 %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [bswap64_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [bswap64_param_0];
|
||||
; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; }
|
||||
; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 291;
|
||||
; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; }
|
||||
|
||||
@@ -15,10 +15,10 @@ entry:
|
||||
; CHECK: call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr %input)
|
||||
%b = getelementptr inbounds %struct.S, ptr %input, i64 0, i32 1
|
||||
%0 = load i32, ptr %b, align 4
|
||||
; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}]
|
||||
; PTX: ld.param.u32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4]
|
||||
; PTX-NOT: ld.param.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}]
|
||||
; PTX: ld.param.b32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4]
|
||||
store i32 %0, ptr %output, align 4
|
||||
; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]]
|
||||
; PTX-NEXT: st.global.b32 [{{%rd[0-9]+}}], [[value]]
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
@@ -13,15 +13,15 @@ define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, ptr noc
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.u8 %rs1, [_Z3foobbbPb_param_0];
|
||||
; CHECK-NEXT: ld.param.b8 %rs1, [_Z3foobbbPb_param_0];
|
||||
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
|
||||
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
|
||||
; CHECK-NEXT: ld.param.u8 %rs3, [_Z3foobbbPb_param_1];
|
||||
; CHECK-NEXT: ld.param.u8 %rs4, [_Z3foobbbPb_param_2];
|
||||
; CHECK-NEXT: ld.param.b8 %rs3, [_Z3foobbbPb_param_1];
|
||||
; CHECK-NEXT: ld.param.b8 %rs4, [_Z3foobbbPb_param_2];
|
||||
; CHECK-NEXT: selp.b16 %rs5, %rs3, %rs4, %p1;
|
||||
; CHECK-NEXT: and.b16 %rs6, %rs5, 1;
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [_Z3foobbbPb_param_3];
|
||||
; CHECK-NEXT: st.u8 [%rd1], %rs6;
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [_Z3foobbbPb_param_3];
|
||||
; CHECK-NEXT: st.b8 [%rd1], %rs6;
|
||||
; CHECK-NEXT: ret;
|
||||
entry:
|
||||
%.sink.v = select i1 %p1, i1 %p2, i1 %p3
|
||||
|
||||
@@ -15,7 +15,7 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p
|
||||
bb:
|
||||
%tmp5 = add nsw i64 %arg3, 8
|
||||
%tmp6 = getelementptr i16, ptr addrspace(1) %arg, i64 %tmp5
|
||||
; CHECK: ld.global.nc.u16
|
||||
; CHECK: ld.global.nc.b16
|
||||
%tmp7 = load i16, ptr addrspace(1) %tmp6, align 2
|
||||
; CHECK: cvt.s32.s16
|
||||
%tmp8 = sext i16 %tmp7 to i64
|
||||
|
||||
@@ -10,7 +10,7 @@ target triple = "nvptx64-unknown-unknown"
|
||||
; CHECK-LABEL: ex_zext
|
||||
define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) {
|
||||
entry:
|
||||
; CHECK: ld.global.nc.u8
|
||||
; CHECK: ld.global.nc.b8
|
||||
%val = load i8, ptr %data
|
||||
; CHECK: cvt.u32.u8
|
||||
%valext = zext i8 %val to i32
|
||||
@@ -21,7 +21,7 @@ entry:
|
||||
; CHECK-LABEL: ex_sext
|
||||
define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) {
|
||||
entry:
|
||||
; CHECK: ld.global.nc.u8
|
||||
; CHECK: ld.global.nc.b8
|
||||
%val = load i8, ptr %data
|
||||
; CHECK: cvt.s32.s8
|
||||
%valext = sext i8 %val to i32
|
||||
@@ -32,7 +32,7 @@ entry:
|
||||
; CHECK-LABEL: ex_zext_v2
|
||||
define ptx_kernel void @ex_zext_v2(ptr noalias readonly %data, ptr %res) {
|
||||
entry:
|
||||
; CHECK: ld.global.nc.v2.u8
|
||||
; CHECK: ld.global.nc.v2.b8
|
||||
%val = load <2 x i8>, ptr %data
|
||||
; CHECK: cvt.u32.u16
|
||||
%valext = zext <2 x i8> %val to <2 x i32>
|
||||
@@ -43,7 +43,7 @@ entry:
|
||||
; CHECK-LABEL: ex_sext_v2
|
||||
define ptx_kernel void @ex_sext_v2(ptr noalias readonly %data, ptr %res) {
|
||||
entry:
|
||||
; CHECK: ld.global.nc.v2.u8
|
||||
; CHECK: ld.global.nc.v2.b8
|
||||
%val = load <2 x i8>, ptr %data
|
||||
; CHECK: cvt.s32.s8
|
||||
%valext = sext <2 x i8> %val to <2 x i32>
|
||||
|
||||
@@ -13,8 +13,8 @@ define void @foo() {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.global.u64 %rd1, [G];
|
||||
; CHECK-NEXT: ld.global.u64 %rd2, [G+8];
|
||||
; CHECK-NEXT: ld.global.b64 %rd1, [G];
|
||||
; CHECK-NEXT: ld.global.b64 %rd2, [G+8];
|
||||
; CHECK-NEXT: { // callseq 0, 0
|
||||
; CHECK-NEXT: .param .align 8 .b8 param0[16];
|
||||
; CHECK-NEXT: st.param.b64 [param0], %rd1;
|
||||
|
||||
@@ -23,11 +23,11 @@ entry:
|
||||
; CHECK: .local .align 4 .b8 __local_depot0[16]
|
||||
; CHECK: mov.b64 %SPL
|
||||
|
||||
; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
|
||||
; CHECK: ld.param.b64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0]
|
||||
; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]]
|
||||
; CHECK: add.u64 %rd[[SP_REG:[0-9]+]], %SP, 0
|
||||
; CHECK: ld.global.f32 %f[[A0_REG:[0-9]+]], [%rd[[A1_REG]]]
|
||||
; CHECK: st.local.f32 [{{%rd[0-9]+}}], %f[[A0_REG]]
|
||||
; CHECK: ld.global.b32 %f[[A0_REG:[0-9]+]], [%rd[[A1_REG]]]
|
||||
; CHECK: st.local.b32 [{{%rd[0-9]+}}], %f[[A0_REG]]
|
||||
|
||||
%0 = load float, ptr %a, align 4
|
||||
store float %0, ptr %buf, align 4
|
||||
|
||||
@@ -9,8 +9,8 @@ define i64 @test() nounwind readnone {
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: mov.b64 %rd1, 1;
|
||||
; CHECK-NEXT: mov.b64 %rd2, 42;
|
||||
; CHECK-NEXT: st.u64 [%rd1], %rd2;
|
||||
; CHECK-NEXT: ld.global.u64 %rd3, [%rd1];
|
||||
; CHECK-NEXT: st.b64 [%rd1], %rd2;
|
||||
; CHECK-NEXT: ld.global.b64 %rd3, [%rd1];
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; CHECK-NEXT: ret;
|
||||
%addr0 = inttoptr i64 1 to ptr
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -18,8 +18,8 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<3>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
|
||||
; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
|
||||
; SM30-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0];
|
||||
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
|
||||
; SM30-NEXT: and.b32 %r10, %r9, 3;
|
||||
@@ -30,9 +30,9 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
|
||||
; SM30-NEXT: and.b32 %r14, %r13, 255;
|
||||
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM30-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1];
|
||||
; SM30-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1];
|
||||
; SM30-NEXT: shl.b32 %r4, %r15, %r1;
|
||||
; SM30-NEXT: ld.u32 %r16, [%rd1];
|
||||
; SM30-NEXT: ld.b32 %r16, [%rd1];
|
||||
; SM30-NEXT: and.b32 %r20, %r16, %r2;
|
||||
; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
|
||||
; SM30-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -59,8 +59,8 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<3>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
|
||||
; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
|
||||
; SM70-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0];
|
||||
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
|
||||
; SM70-NEXT: and.b32 %r10, %r9, 3;
|
||||
@@ -71,9 +71,9 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
|
||||
; SM70-NEXT: and.b32 %r14, %r13, 255;
|
||||
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM70-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1];
|
||||
; SM70-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1];
|
||||
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
|
||||
; SM70-NEXT: ld.u32 %r16, [%rd1];
|
||||
; SM70-NEXT: ld.b32 %r16, [%rd1];
|
||||
; SM70-NEXT: and.b32 %r20, %r16, %r2;
|
||||
; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
|
||||
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -99,8 +99,8 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
|
||||
; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
|
||||
; SM90-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0];
|
||||
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
|
||||
; SM90-NEXT: and.b32 %r10, %r9, 3;
|
||||
@@ -111,9 +111,9 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
|
||||
; SM90-NEXT: and.b32 %r14, %r13, 255;
|
||||
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM90-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1];
|
||||
; SM90-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1];
|
||||
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
|
||||
; SM90-NEXT: ld.u32 %r16, [%rd1];
|
||||
; SM90-NEXT: ld.b32 %r16, [%rd1];
|
||||
; SM90-NEXT: and.b32 %r20, %r16, %r2;
|
||||
; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
|
||||
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -144,8 +144,8 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<3>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2];
|
||||
; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0];
|
||||
; SM30-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0];
|
||||
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
|
||||
; SM30-NEXT: and.b32 %r10, %r9, 3;
|
||||
@@ -156,9 +156,9 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
|
||||
; SM30-NEXT: and.b32 %r14, %r13, 255;
|
||||
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM30-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1];
|
||||
; SM30-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1];
|
||||
; SM30-NEXT: shl.b32 %r4, %r15, %r1;
|
||||
; SM30-NEXT: ld.u32 %r16, [%rd1];
|
||||
; SM30-NEXT: ld.b32 %r16, [%rd1];
|
||||
; SM30-NEXT: and.b32 %r20, %r16, %r2;
|
||||
; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
|
||||
; SM30-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -186,8 +186,8 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<3>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2];
|
||||
; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0];
|
||||
; SM70-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0];
|
||||
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
|
||||
; SM70-NEXT: and.b32 %r10, %r9, 3;
|
||||
@@ -198,9 +198,9 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
|
||||
; SM70-NEXT: and.b32 %r14, %r13, 255;
|
||||
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM70-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1];
|
||||
; SM70-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1];
|
||||
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
|
||||
; SM70-NEXT: ld.u32 %r16, [%rd1];
|
||||
; SM70-NEXT: ld.b32 %r16, [%rd1];
|
||||
; SM70-NEXT: and.b32 %r20, %r16, %r2;
|
||||
; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
|
||||
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -227,8 +227,8 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2];
|
||||
; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0];
|
||||
; SM90-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0];
|
||||
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
|
||||
; SM90-NEXT: and.b32 %r10, %r9, 3;
|
||||
@@ -239,9 +239,9 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
|
||||
; SM90-NEXT: and.b32 %r14, %r13, 255;
|
||||
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM90-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1];
|
||||
; SM90-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1];
|
||||
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
|
||||
; SM90-NEXT: ld.u32 %r16, [%rd1];
|
||||
; SM90-NEXT: ld.b32 %r16, [%rd1];
|
||||
; SM90-NEXT: and.b32 %r20, %r16, %r2;
|
||||
; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
|
||||
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -273,8 +273,8 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<3>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2];
|
||||
; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0];
|
||||
; SM30-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0];
|
||||
; SM30-NEXT: membar.sys;
|
||||
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
|
||||
@@ -286,9 +286,9 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
|
||||
; SM30-NEXT: and.b32 %r14, %r13, 255;
|
||||
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM30-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1];
|
||||
; SM30-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1];
|
||||
; SM30-NEXT: shl.b32 %r4, %r15, %r1;
|
||||
; SM30-NEXT: ld.u32 %r16, [%rd1];
|
||||
; SM30-NEXT: ld.b32 %r16, [%rd1];
|
||||
; SM30-NEXT: and.b32 %r20, %r16, %r2;
|
||||
; SM30-NEXT: $L__BB2_1: // %partword.cmpxchg.loop
|
||||
; SM30-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -315,8 +315,8 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<3>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2];
|
||||
; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0];
|
||||
; SM70-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0];
|
||||
; SM70-NEXT: fence.acq_rel.sys;
|
||||
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
|
||||
@@ -328,9 +328,9 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
|
||||
; SM70-NEXT: and.b32 %r14, %r13, 255;
|
||||
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM70-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1];
|
||||
; SM70-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1];
|
||||
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
|
||||
; SM70-NEXT: ld.u32 %r16, [%rd1];
|
||||
; SM70-NEXT: ld.b32 %r16, [%rd1];
|
||||
; SM70-NEXT: and.b32 %r20, %r16, %r2;
|
||||
; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop
|
||||
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -356,8 +356,8 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2];
|
||||
; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0];
|
||||
; SM90-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0];
|
||||
; SM90-NEXT: fence.release.sys;
|
||||
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
|
||||
@@ -369,9 +369,9 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
|
||||
; SM90-NEXT: and.b32 %r14, %r13, 255;
|
||||
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM90-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1];
|
||||
; SM90-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1];
|
||||
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
|
||||
; SM90-NEXT: ld.u32 %r16, [%rd1];
|
||||
; SM90-NEXT: ld.b32 %r16, [%rd1];
|
||||
; SM90-NEXT: and.b32 %r20, %r16, %r2;
|
||||
; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop
|
||||
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -402,8 +402,8 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<3>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2];
|
||||
; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0];
|
||||
; SM30-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0];
|
||||
; SM30-NEXT: membar.sys;
|
||||
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
|
||||
@@ -415,9 +415,9 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
|
||||
; SM30-NEXT: and.b32 %r14, %r13, 255;
|
||||
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM30-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1];
|
||||
; SM30-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1];
|
||||
; SM30-NEXT: shl.b32 %r4, %r15, %r1;
|
||||
; SM30-NEXT: ld.u32 %r16, [%rd1];
|
||||
; SM30-NEXT: ld.b32 %r16, [%rd1];
|
||||
; SM30-NEXT: and.b32 %r20, %r16, %r2;
|
||||
; SM30-NEXT: $L__BB3_1: // %partword.cmpxchg.loop
|
||||
; SM30-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -445,8 +445,8 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<3>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2];
|
||||
; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0];
|
||||
; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0];
|
||||
; SM70-NEXT: fence.acq_rel.sys;
|
||||
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
|
||||
@@ -458,9 +458,9 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
|
||||
; SM70-NEXT: and.b32 %r14, %r13, 255;
|
||||
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM70-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1];
|
||||
; SM70-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1];
|
||||
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
|
||||
; SM70-NEXT: ld.u32 %r16, [%rd1];
|
||||
; SM70-NEXT: ld.b32 %r16, [%rd1];
|
||||
; SM70-NEXT: and.b32 %r20, %r16, %r2;
|
||||
; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop
|
||||
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -487,8 +487,8 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2];
|
||||
; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0];
|
||||
; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0];
|
||||
; SM90-NEXT: fence.release.sys;
|
||||
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
|
||||
@@ -500,9 +500,9 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
|
||||
; SM90-NEXT: and.b32 %r14, %r13, 255;
|
||||
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM90-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1];
|
||||
; SM90-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1];
|
||||
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
|
||||
; SM90-NEXT: ld.u32 %r16, [%rd1];
|
||||
; SM90-NEXT: ld.b32 %r16, [%rd1];
|
||||
; SM90-NEXT: and.b32 %r20, %r16, %r2;
|
||||
; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop
|
||||
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -534,8 +534,8 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<3>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2];
|
||||
; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0];
|
||||
; SM30-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0];
|
||||
; SM30-NEXT: membar.sys;
|
||||
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM30-NEXT: cvt.u32.u64 %r9, %rd2;
|
||||
@@ -547,9 +547,9 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM30-NEXT: cvt.u32.u16 %r13, %rs1;
|
||||
; SM30-NEXT: and.b32 %r14, %r13, 255;
|
||||
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM30-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1];
|
||||
; SM30-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1];
|
||||
; SM30-NEXT: shl.b32 %r4, %r15, %r1;
|
||||
; SM30-NEXT: ld.u32 %r16, [%rd1];
|
||||
; SM30-NEXT: ld.b32 %r16, [%rd1];
|
||||
; SM30-NEXT: and.b32 %r20, %r16, %r2;
|
||||
; SM30-NEXT: $L__BB4_1: // %partword.cmpxchg.loop
|
||||
; SM30-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -577,8 +577,8 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<3>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2];
|
||||
; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0];
|
||||
; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0];
|
||||
; SM70-NEXT: fence.sc.sys;
|
||||
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
|
||||
@@ -590,9 +590,9 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
|
||||
; SM70-NEXT: and.b32 %r14, %r13, 255;
|
||||
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM70-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1];
|
||||
; SM70-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1];
|
||||
; SM70-NEXT: shl.b32 %r4, %r15, %r1;
|
||||
; SM70-NEXT: ld.u32 %r16, [%rd1];
|
||||
; SM70-NEXT: ld.b32 %r16, [%rd1];
|
||||
; SM70-NEXT: and.b32 %r20, %r16, %r2;
|
||||
; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop
|
||||
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -619,8 +619,8 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2];
|
||||
; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0];
|
||||
; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0];
|
||||
; SM90-NEXT: fence.sc.sys;
|
||||
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM90-NEXT: cvt.u32.u64 %r9, %rd2;
|
||||
@@ -632,9 +632,9 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
|
||||
; SM90-NEXT: cvt.u32.u16 %r13, %rs1;
|
||||
; SM90-NEXT: and.b32 %r14, %r13, 255;
|
||||
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM90-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1];
|
||||
; SM90-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1];
|
||||
; SM90-NEXT: shl.b32 %r4, %r15, %r1;
|
||||
; SM90-NEXT: ld.u32 %r16, [%rd1];
|
||||
; SM90-NEXT: ld.b32 %r16, [%rd1];
|
||||
; SM90-NEXT: and.b32 %r20, %r16, %r2;
|
||||
; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop
|
||||
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -667,10 +667,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<3>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
|
||||
; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
|
||||
; SM30-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0];
|
||||
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM30-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1];
|
||||
; SM30-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1];
|
||||
; SM30-NEXT: cvt.u32.u64 %r10, %rd2;
|
||||
; SM30-NEXT: and.b32 %r11, %r10, 3;
|
||||
; SM30-NEXT: shl.b32 %r1, %r11, 3;
|
||||
@@ -680,7 +680,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM30-NEXT: cvt.u32.u16 %r14, %rs1;
|
||||
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM30-NEXT: shl.b32 %r4, %r9, %r1;
|
||||
; SM30-NEXT: ld.u32 %r15, [%rd1];
|
||||
; SM30-NEXT: ld.b32 %r15, [%rd1];
|
||||
; SM30-NEXT: and.b32 %r19, %r15, %r2;
|
||||
; SM30-NEXT: $L__BB5_1: // %partword.cmpxchg.loop
|
||||
; SM30-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -707,10 +707,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<3>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
|
||||
; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
|
||||
; SM70-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0];
|
||||
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM70-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1];
|
||||
; SM70-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1];
|
||||
; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
|
||||
; SM70-NEXT: and.b32 %r11, %r10, 3;
|
||||
; SM70-NEXT: shl.b32 %r1, %r11, 3;
|
||||
@@ -720,7 +720,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
|
||||
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM70-NEXT: shl.b32 %r4, %r9, %r1;
|
||||
; SM70-NEXT: ld.u32 %r15, [%rd1];
|
||||
; SM70-NEXT: ld.b32 %r15, [%rd1];
|
||||
; SM70-NEXT: and.b32 %r19, %r15, %r2;
|
||||
; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop
|
||||
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -746,10 +746,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
|
||||
; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
|
||||
; SM90-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0];
|
||||
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM90-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1];
|
||||
; SM90-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1];
|
||||
; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
|
||||
; SM90-NEXT: and.b32 %r11, %r10, 3;
|
||||
; SM90-NEXT: shl.b32 %r1, %r11, 3;
|
||||
@@ -759,7 +759,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
|
||||
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM90-NEXT: shl.b32 %r4, %r9, %r1;
|
||||
; SM90-NEXT: ld.u32 %r15, [%rd1];
|
||||
; SM90-NEXT: ld.b32 %r15, [%rd1];
|
||||
; SM90-NEXT: and.b32 %r19, %r15, %r2;
|
||||
; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop
|
||||
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -790,10 +790,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<3>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2];
|
||||
; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0];
|
||||
; SM30-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0];
|
||||
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM30-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1];
|
||||
; SM30-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1];
|
||||
; SM30-NEXT: cvt.u32.u64 %r10, %rd2;
|
||||
; SM30-NEXT: and.b32 %r11, %r10, 3;
|
||||
; SM30-NEXT: shl.b32 %r1, %r11, 3;
|
||||
@@ -803,7 +803,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM30-NEXT: cvt.u32.u16 %r14, %rs1;
|
||||
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM30-NEXT: shl.b32 %r4, %r9, %r1;
|
||||
; SM30-NEXT: ld.u32 %r15, [%rd1];
|
||||
; SM30-NEXT: ld.b32 %r15, [%rd1];
|
||||
; SM30-NEXT: and.b32 %r19, %r15, %r2;
|
||||
; SM30-NEXT: $L__BB6_1: // %partword.cmpxchg.loop
|
||||
; SM30-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -831,10 +831,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<3>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2];
|
||||
; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0];
|
||||
; SM70-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0];
|
||||
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM70-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1];
|
||||
; SM70-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1];
|
||||
; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
|
||||
; SM70-NEXT: and.b32 %r11, %r10, 3;
|
||||
; SM70-NEXT: shl.b32 %r1, %r11, 3;
|
||||
@@ -844,7 +844,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
|
||||
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM70-NEXT: shl.b32 %r4, %r9, %r1;
|
||||
; SM70-NEXT: ld.u32 %r15, [%rd1];
|
||||
; SM70-NEXT: ld.b32 %r15, [%rd1];
|
||||
; SM70-NEXT: and.b32 %r19, %r15, %r2;
|
||||
; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop
|
||||
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -871,10 +871,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2];
|
||||
; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0];
|
||||
; SM90-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0];
|
||||
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM90-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1];
|
||||
; SM90-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1];
|
||||
; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
|
||||
; SM90-NEXT: and.b32 %r11, %r10, 3;
|
||||
; SM90-NEXT: shl.b32 %r1, %r11, 3;
|
||||
@@ -884,7 +884,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
|
||||
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM90-NEXT: shl.b32 %r4, %r9, %r1;
|
||||
; SM90-NEXT: ld.u32 %r15, [%rd1];
|
||||
; SM90-NEXT: ld.b32 %r15, [%rd1];
|
||||
; SM90-NEXT: and.b32 %r19, %r15, %r2;
|
||||
; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop
|
||||
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -916,10 +916,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<3>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2];
|
||||
; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0];
|
||||
; SM30-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0];
|
||||
; SM30-NEXT: membar.sys;
|
||||
; SM30-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1];
|
||||
; SM30-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1];
|
||||
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM30-NEXT: cvt.u32.u64 %r10, %rd2;
|
||||
; SM30-NEXT: and.b32 %r11, %r10, 3;
|
||||
@@ -930,7 +930,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM30-NEXT: cvt.u32.u16 %r14, %rs1;
|
||||
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM30-NEXT: shl.b32 %r4, %r9, %r1;
|
||||
; SM30-NEXT: ld.u32 %r15, [%rd1];
|
||||
; SM30-NEXT: ld.b32 %r15, [%rd1];
|
||||
; SM30-NEXT: and.b32 %r19, %r15, %r2;
|
||||
; SM30-NEXT: $L__BB7_1: // %partword.cmpxchg.loop
|
||||
; SM30-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -957,10 +957,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<3>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2];
|
||||
; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0];
|
||||
; SM70-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0];
|
||||
; SM70-NEXT: fence.acq_rel.sys;
|
||||
; SM70-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1];
|
||||
; SM70-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1];
|
||||
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
|
||||
; SM70-NEXT: and.b32 %r11, %r10, 3;
|
||||
@@ -971,7 +971,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
|
||||
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM70-NEXT: shl.b32 %r4, %r9, %r1;
|
||||
; SM70-NEXT: ld.u32 %r15, [%rd1];
|
||||
; SM70-NEXT: ld.b32 %r15, [%rd1];
|
||||
; SM70-NEXT: and.b32 %r19, %r15, %r2;
|
||||
; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop
|
||||
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -997,10 +997,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2];
|
||||
; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0];
|
||||
; SM90-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0];
|
||||
; SM90-NEXT: fence.release.sys;
|
||||
; SM90-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1];
|
||||
; SM90-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1];
|
||||
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
|
||||
; SM90-NEXT: and.b32 %r11, %r10, 3;
|
||||
@@ -1011,7 +1011,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
|
||||
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM90-NEXT: shl.b32 %r4, %r9, %r1;
|
||||
; SM90-NEXT: ld.u32 %r15, [%rd1];
|
||||
; SM90-NEXT: ld.b32 %r15, [%rd1];
|
||||
; SM90-NEXT: and.b32 %r19, %r15, %r2;
|
||||
; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop
|
||||
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -1042,10 +1042,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<3>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2];
|
||||
; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0];
|
||||
; SM30-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0];
|
||||
; SM30-NEXT: membar.sys;
|
||||
; SM30-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1];
|
||||
; SM30-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1];
|
||||
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM30-NEXT: cvt.u32.u64 %r10, %rd2;
|
||||
; SM30-NEXT: and.b32 %r11, %r10, 3;
|
||||
@@ -1056,7 +1056,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM30-NEXT: cvt.u32.u16 %r14, %rs1;
|
||||
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM30-NEXT: shl.b32 %r4, %r9, %r1;
|
||||
; SM30-NEXT: ld.u32 %r15, [%rd1];
|
||||
; SM30-NEXT: ld.b32 %r15, [%rd1];
|
||||
; SM30-NEXT: and.b32 %r19, %r15, %r2;
|
||||
; SM30-NEXT: $L__BB8_1: // %partword.cmpxchg.loop
|
||||
; SM30-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -1084,10 +1084,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<3>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2];
|
||||
; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0];
|
||||
; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0];
|
||||
; SM70-NEXT: fence.acq_rel.sys;
|
||||
; SM70-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1];
|
||||
; SM70-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1];
|
||||
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
|
||||
; SM70-NEXT: and.b32 %r11, %r10, 3;
|
||||
@@ -1098,7 +1098,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
|
||||
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM70-NEXT: shl.b32 %r4, %r9, %r1;
|
||||
; SM70-NEXT: ld.u32 %r15, [%rd1];
|
||||
; SM70-NEXT: ld.b32 %r15, [%rd1];
|
||||
; SM70-NEXT: and.b32 %r19, %r15, %r2;
|
||||
; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop
|
||||
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -1125,10 +1125,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2];
|
||||
; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0];
|
||||
; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0];
|
||||
; SM90-NEXT: fence.release.sys;
|
||||
; SM90-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1];
|
||||
; SM90-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1];
|
||||
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
|
||||
; SM90-NEXT: and.b32 %r11, %r10, 3;
|
||||
@@ -1139,7 +1139,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
|
||||
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM90-NEXT: shl.b32 %r4, %r9, %r1;
|
||||
; SM90-NEXT: ld.u32 %r15, [%rd1];
|
||||
; SM90-NEXT: ld.b32 %r15, [%rd1];
|
||||
; SM90-NEXT: and.b32 %r19, %r15, %r2;
|
||||
; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop
|
||||
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -1172,10 +1172,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<3>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2];
|
||||
; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0];
|
||||
; SM30-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0];
|
||||
; SM30-NEXT: membar.sys;
|
||||
; SM30-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1];
|
||||
; SM30-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1];
|
||||
; SM30-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM30-NEXT: cvt.u32.u64 %r10, %rd2;
|
||||
; SM30-NEXT: and.b32 %r11, %r10, 3;
|
||||
@@ -1186,7 +1186,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM30-NEXT: cvt.u32.u16 %r14, %rs1;
|
||||
; SM30-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM30-NEXT: shl.b32 %r4, %r9, %r1;
|
||||
; SM30-NEXT: ld.u32 %r15, [%rd1];
|
||||
; SM30-NEXT: ld.b32 %r15, [%rd1];
|
||||
; SM30-NEXT: and.b32 %r19, %r15, %r2;
|
||||
; SM30-NEXT: $L__BB9_1: // %partword.cmpxchg.loop
|
||||
; SM30-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -1214,10 +1214,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<3>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2];
|
||||
; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0];
|
||||
; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0];
|
||||
; SM70-NEXT: fence.sc.sys;
|
||||
; SM70-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1];
|
||||
; SM70-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1];
|
||||
; SM70-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
|
||||
; SM70-NEXT: and.b32 %r11, %r10, 3;
|
||||
@@ -1228,7 +1228,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
|
||||
; SM70-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM70-NEXT: shl.b32 %r4, %r9, %r1;
|
||||
; SM70-NEXT: ld.u32 %r15, [%rd1];
|
||||
; SM70-NEXT: ld.b32 %r15, [%rd1];
|
||||
; SM70-NEXT: and.b32 %r19, %r15, %r2;
|
||||
; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop
|
||||
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -1255,10 +1255,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<3>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2];
|
||||
; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0];
|
||||
; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0];
|
||||
; SM90-NEXT: fence.sc.sys;
|
||||
; SM90-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1];
|
||||
; SM90-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1];
|
||||
; SM90-NEXT: and.b64 %rd1, %rd2, -4;
|
||||
; SM90-NEXT: cvt.u32.u64 %r10, %rd2;
|
||||
; SM90-NEXT: and.b32 %r11, %r10, 3;
|
||||
@@ -1269,7 +1269,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
|
||||
; SM90-NEXT: cvt.u32.u16 %r14, %rs1;
|
||||
; SM90-NEXT: shl.b32 %r3, %r14, %r1;
|
||||
; SM90-NEXT: shl.b32 %r4, %r9, %r1;
|
||||
; SM90-NEXT: ld.u32 %r15, [%rd1];
|
||||
; SM90-NEXT: ld.b32 %r15, [%rd1];
|
||||
; SM90-NEXT: and.b32 %r19, %r15, %r2;
|
||||
; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop
|
||||
; SM90-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -1300,9 +1300,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<2>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0];
|
||||
; SM30-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1];
|
||||
; SM30-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0];
|
||||
; SM30-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1];
|
||||
; SM30-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2];
|
||||
; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; SM30-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; SM30-NEXT: ret;
|
||||
@@ -1313,9 +1313,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0];
|
||||
; SM70-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1];
|
||||
; SM70-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0];
|
||||
; SM70-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1];
|
||||
; SM70-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2];
|
||||
; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; SM70-NEXT: ret;
|
||||
@@ -1325,9 +1325,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<2>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0];
|
||||
; SM90-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1];
|
||||
; SM90-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0];
|
||||
; SM90-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1];
|
||||
; SM90-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2];
|
||||
; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; SM90-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; SM90-NEXT: ret;
|
||||
@@ -1342,9 +1342,9 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<2>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0];
|
||||
; SM30-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1];
|
||||
; SM30-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0];
|
||||
; SM30-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1];
|
||||
; SM30-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2];
|
||||
; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; SM30-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; SM30-NEXT: ret;
|
||||
@@ -1355,9 +1355,9 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0];
|
||||
; SM70-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1];
|
||||
; SM70-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0];
|
||||
; SM70-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1];
|
||||
; SM70-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2];
|
||||
; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; SM70-NEXT: ret;
|
||||
@@ -1367,9 +1367,9 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<2>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0];
|
||||
; SM90-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1];
|
||||
; SM90-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0];
|
||||
; SM90-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1];
|
||||
; SM90-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2];
|
||||
; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; SM90-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; SM90-NEXT: ret;
|
||||
@@ -1384,9 +1384,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<2>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0];
|
||||
; SM30-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1];
|
||||
; SM30-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0];
|
||||
; SM30-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1];
|
||||
; SM30-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2];
|
||||
; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; SM30-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; SM30-NEXT: ret;
|
||||
@@ -1397,9 +1397,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0];
|
||||
; SM70-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1];
|
||||
; SM70-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0];
|
||||
; SM70-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1];
|
||||
; SM70-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2];
|
||||
; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; SM70-NEXT: ret;
|
||||
@@ -1409,9 +1409,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<2>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0];
|
||||
; SM90-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1];
|
||||
; SM90-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0];
|
||||
; SM90-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1];
|
||||
; SM90-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2];
|
||||
; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; SM90-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; SM90-NEXT: ret;
|
||||
@@ -1426,9 +1426,9 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<2>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0];
|
||||
; SM30-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1];
|
||||
; SM30-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0];
|
||||
; SM30-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1];
|
||||
; SM30-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2];
|
||||
; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; SM30-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; SM30-NEXT: ret;
|
||||
@@ -1439,9 +1439,9 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0];
|
||||
; SM70-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1];
|
||||
; SM70-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0];
|
||||
; SM70-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1];
|
||||
; SM70-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2];
|
||||
; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; SM70-NEXT: ret;
|
||||
@@ -1451,9 +1451,9 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<2>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0];
|
||||
; SM90-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1];
|
||||
; SM90-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0];
|
||||
; SM90-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1];
|
||||
; SM90-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2];
|
||||
; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; SM90-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; SM90-NEXT: ret;
|
||||
@@ -1468,10 +1468,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<2>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0];
|
||||
; SM30-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i32_param_0];
|
||||
; SM30-NEXT: membar.sys;
|
||||
; SM30-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1];
|
||||
; SM30-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2];
|
||||
; SM30-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1];
|
||||
; SM30-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2];
|
||||
; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; SM30-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; SM30-NEXT: ret;
|
||||
@@ -1482,10 +1482,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<2>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0];
|
||||
; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i32_param_0];
|
||||
; SM70-NEXT: fence.sc.sys;
|
||||
; SM70-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1];
|
||||
; SM70-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2];
|
||||
; SM70-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1];
|
||||
; SM70-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2];
|
||||
; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; SM70-NEXT: ret;
|
||||
@@ -1495,10 +1495,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<2>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0];
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i32_param_0];
|
||||
; SM90-NEXT: fence.sc.sys;
|
||||
; SM90-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1];
|
||||
; SM90-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2];
|
||||
; SM90-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1];
|
||||
; SM90-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2];
|
||||
; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
|
||||
; SM90-NEXT: fence.acquire.sys;
|
||||
; SM90-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
@@ -1514,9 +1514,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<5>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0];
|
||||
; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1];
|
||||
; SM30-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0];
|
||||
; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1];
|
||||
; SM30-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2];
|
||||
; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; SM30-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; SM30-NEXT: ret;
|
||||
@@ -1526,9 +1526,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<5>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0];
|
||||
; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1];
|
||||
; SM70-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0];
|
||||
; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1];
|
||||
; SM70-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2];
|
||||
; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; SM70-NEXT: ret;
|
||||
@@ -1537,9 +1537,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<5>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0];
|
||||
; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1];
|
||||
; SM90-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1];
|
||||
; SM90-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2];
|
||||
; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; SM90-NEXT: ret;
|
||||
@@ -1553,9 +1553,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<5>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0];
|
||||
; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1];
|
||||
; SM30-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0];
|
||||
; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i64_param_1];
|
||||
; SM30-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2];
|
||||
; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; SM30-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; SM30-NEXT: ret;
|
||||
@@ -1565,9 +1565,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<5>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0];
|
||||
; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1];
|
||||
; SM70-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0];
|
||||
; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i64_param_1];
|
||||
; SM70-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2];
|
||||
; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; SM70-NEXT: ret;
|
||||
@@ -1576,9 +1576,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<5>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0];
|
||||
; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1];
|
||||
; SM90-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [acquire_sys_i64_param_1];
|
||||
; SM90-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2];
|
||||
; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; SM90-NEXT: ret;
|
||||
@@ -1592,9 +1592,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<5>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0];
|
||||
; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1];
|
||||
; SM30-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0];
|
||||
; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1];
|
||||
; SM30-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2];
|
||||
; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; SM30-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; SM30-NEXT: ret;
|
||||
@@ -1604,9 +1604,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<5>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0];
|
||||
; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1];
|
||||
; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0];
|
||||
; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1];
|
||||
; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2];
|
||||
; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; SM70-NEXT: ret;
|
||||
@@ -1615,9 +1615,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<5>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0];
|
||||
; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1];
|
||||
; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1];
|
||||
; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2];
|
||||
; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; SM90-NEXT: ret;
|
||||
@@ -1631,9 +1631,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<5>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0];
|
||||
; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1];
|
||||
; SM30-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0];
|
||||
; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1];
|
||||
; SM30-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2];
|
||||
; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; SM30-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; SM30-NEXT: ret;
|
||||
@@ -1643,9 +1643,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<5>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0];
|
||||
; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1];
|
||||
; SM70-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0];
|
||||
; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1];
|
||||
; SM70-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2];
|
||||
; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; SM70-NEXT: ret;
|
||||
@@ -1654,9 +1654,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<5>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0];
|
||||
; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1];
|
||||
; SM90-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1];
|
||||
; SM90-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2];
|
||||
; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; SM90-NEXT: ret;
|
||||
@@ -1670,10 +1670,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; SM30-NEXT: .reg .b64 %rd<5>;
|
||||
; SM30-EMPTY:
|
||||
; SM30-NEXT: // %bb.0:
|
||||
; SM30-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0];
|
||||
; SM30-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i64_param_0];
|
||||
; SM30-NEXT: membar.sys;
|
||||
; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1];
|
||||
; SM30-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2];
|
||||
; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1];
|
||||
; SM30-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2];
|
||||
; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; SM30-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; SM30-NEXT: ret;
|
||||
@@ -1683,10 +1683,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; SM70-NEXT: .reg .b64 %rd<5>;
|
||||
; SM70-EMPTY:
|
||||
; SM70-NEXT: // %bb.0:
|
||||
; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0];
|
||||
; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i64_param_0];
|
||||
; SM70-NEXT: fence.sc.sys;
|
||||
; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1];
|
||||
; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2];
|
||||
; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1];
|
||||
; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2];
|
||||
; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
; SM70-NEXT: ret;
|
||||
@@ -1695,10 +1695,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
|
||||
; SM90-NEXT: .reg .b64 %rd<5>;
|
||||
; SM90-EMPTY:
|
||||
; SM90-NEXT: // %bb.0:
|
||||
; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0];
|
||||
; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i64_param_0];
|
||||
; SM90-NEXT: fence.sc.sys;
|
||||
; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1];
|
||||
; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2];
|
||||
; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1];
|
||||
; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2];
|
||||
; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
|
||||
; SM90-NEXT: fence.acquire.sys;
|
||||
; SM90-NEXT: st.param.b64 [func_retval0], %rd3;
|
||||
|
||||
@@ -11,8 +11,8 @@ define i32 @test1(i32 %n, i32 %m) {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test1_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test1_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test1_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test1_param_1];
|
||||
; CHECK-NEXT: mad.lo.s32 %r3, %r2, %r1, %r2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -28,8 +28,8 @@ define i32 @test1_rev(i32 %n, i32 %m) {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test1_rev_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test1_rev_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test1_rev_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test1_rev_param_1];
|
||||
; CHECK-NEXT: mad.lo.s32 %r3, %r2, %r1, %r2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -47,9 +47,9 @@ define i32 @test2(i32 %n, i32 %m, i32 %s) {
|
||||
; CHECK-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test2_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test2_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test2_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test2_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test2_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test2_param_2];
|
||||
; CHECK-NEXT: setp.lt.s32 %p1, %r3, 1;
|
||||
; CHECK-NEXT: mad.lo.s32 %r4, %r2, %r1, %r2;
|
||||
; CHECK-NEXT: selp.b32 %r5, %r2, %r4, %p1;
|
||||
@@ -71,9 +71,9 @@ define i32 @test2_rev1(i32 %n, i32 %m, i32 %s) {
|
||||
; CHECK-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test2_rev1_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test2_rev1_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test2_rev1_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test2_rev1_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test2_rev1_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test2_rev1_param_2];
|
||||
; CHECK-NEXT: setp.lt.s32 %p1, %r3, 1;
|
||||
; CHECK-NEXT: mad.lo.s32 %r4, %r2, %r1, %r2;
|
||||
; CHECK-NEXT: selp.b32 %r5, %r4, %r2, %p1;
|
||||
@@ -95,9 +95,9 @@ define i32 @test2_rev2(i32 %n, i32 %m, i32 %s) {
|
||||
; CHECK-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test2_rev2_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test2_rev2_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test2_rev2_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test2_rev2_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test2_rev2_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test2_rev2_param_2];
|
||||
; CHECK-NEXT: setp.lt.s32 %p1, %r3, 1;
|
||||
; CHECK-NEXT: mad.lo.s32 %r4, %r2, %r1, %r2;
|
||||
; CHECK-NEXT: selp.b32 %r5, %r4, %r2, %p1;
|
||||
@@ -119,10 +119,10 @@ define i32 @test3(i32 %n, i32 %m, i32 %s) {
|
||||
; CHECK-NEXT: .reg .b32 %r<7>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test3_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test3_param_0];
|
||||
; CHECK-NEXT: add.s32 %r2, %r1, 3;
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test3_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r4, [test3_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test3_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r4, [test3_param_2];
|
||||
; CHECK-NEXT: setp.lt.s32 %p1, %r4, 1;
|
||||
; CHECK-NEXT: selp.b32 %r5, 1, %r2, %p1;
|
||||
; CHECK-NEXT: mul.lo.s32 %r6, %r5, %r3;
|
||||
@@ -144,12 +144,12 @@ define i32 @test4(i32 %a, i32 %b, i32 %c, i1 %p) {
|
||||
; CHECK-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u8 %rs1, [test4_param_3];
|
||||
; CHECK-NEXT: ld.param.b8 %rs1, [test4_param_3];
|
||||
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
|
||||
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test4_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test4_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test4_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test4_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test4_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test4_param_2];
|
||||
; CHECK-NEXT: mad.lo.s32 %r4, %r1, %r2, %r3;
|
||||
; CHECK-NEXT: selp.b32 %r5, %r4, %r3, %p1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
|
||||
@@ -168,12 +168,12 @@ define i32 @test4_rev(i32 %a, i32 %b, i32 %c, i1 %p) {
|
||||
; CHECK-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u8 %rs1, [test4_rev_param_3];
|
||||
; CHECK-NEXT: ld.param.b8 %rs1, [test4_rev_param_3];
|
||||
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
|
||||
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test4_rev_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test4_rev_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test4_rev_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test4_rev_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test4_rev_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test4_rev_param_2];
|
||||
; CHECK-NEXT: mad.lo.s32 %r4, %r1, %r2, %r3;
|
||||
; CHECK-NEXT: selp.b32 %r5, %r3, %r4, %p1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r5;
|
||||
@@ -192,10 +192,10 @@ define i32 @test_mad_multi_use(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK-NEXT: .reg .b32 %r<8>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_mad_multi_use_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_mad_multi_use_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_mad_multi_use_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_mad_multi_use_param_1];
|
||||
; CHECK-NEXT: mul.lo.s32 %r3, %r1, %r2;
|
||||
; CHECK-NEXT: ld.param.u32 %r4, [test_mad_multi_use_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r4, [test_mad_multi_use_param_2];
|
||||
; CHECK-NEXT: add.s32 %r5, %r3, %r4;
|
||||
; CHECK-NEXT: { // callseq 0, 0
|
||||
; CHECK-NEXT: .param .b32 param0;
|
||||
@@ -227,7 +227,7 @@ define i32 @test_mad_fold(i32 %x) {
|
||||
; CHECK-NEXT: .reg .b32 %r<7>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_mad_fold_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_mad_fold_param_0];
|
||||
; CHECK-NEXT: mul.hi.s32 %r2, %r1, -2147221471;
|
||||
; CHECK-NEXT: add.s32 %r3, %r2, %r1;
|
||||
; CHECK-NEXT: shr.u32 %r4, %r3, 31;
|
||||
|
||||
@@ -11,7 +11,7 @@ define i8 @cvt_u8_f32(float %x) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_u8_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_u8_f32_param_0];
|
||||
; CHECK-NEXT: cvt.rzi.u16.f32 %rs1, %f1;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
@@ -28,7 +28,7 @@ define i8 @cvt_u8_f64(double %x) {
|
||||
; CHECK-NEXT: .reg .b64 %fd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f64 %fd1, [cvt_u8_f64_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %fd1, [cvt_u8_f64_param_0];
|
||||
; CHECK-NEXT: cvt.rzi.u16.f64 %rs1, %fd1;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
@@ -44,9 +44,9 @@ define float @cvt_f32_i8(i8 %x) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u8 %rs1, [cvt_f32_i8_param_0];
|
||||
; CHECK-NEXT: ld.param.b8 %rs1, [cvt_f32_i8_param_0];
|
||||
; CHECK-NEXT: cvt.rn.f32.u16 %f1, %rs1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f1;
|
||||
; CHECK-NEXT: ret;
|
||||
%a = uitofp i8 %x to float
|
||||
ret float %a
|
||||
@@ -59,9 +59,9 @@ define double @cvt_f64_i8(i8 %x) {
|
||||
; CHECK-NEXT: .reg .b64 %fd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u8 %rs1, [cvt_f64_i8_param_0];
|
||||
; CHECK-NEXT: ld.param.b8 %rs1, [cvt_f64_i8_param_0];
|
||||
; CHECK-NEXT: cvt.rn.f64.u16 %fd1, %rs1;
|
||||
; CHECK-NEXT: st.param.f64 [func_retval0], %fd1;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %fd1;
|
||||
; CHECK-NEXT: ret;
|
||||
%a = uitofp i8 %x to double
|
||||
ret double %a
|
||||
@@ -76,7 +76,7 @@ define float @cvt_f32_s8(i8 %x) {
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.s8 %rs1, [cvt_f32_s8_param_0];
|
||||
; CHECK-NEXT: cvt.rn.f32.s16 %f1, %rs1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f1;
|
||||
; CHECK-NEXT: ret;
|
||||
%a = sitofp i8 %x to float
|
||||
ret float %a
|
||||
@@ -91,7 +91,7 @@ define double @cvt_f64_s8(i8 %x) {
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.s8 %rs1, [cvt_f64_s8_param_0];
|
||||
; CHECK-NEXT: cvt.rn.f64.s16 %fd1, %rs1;
|
||||
; CHECK-NEXT: st.param.f64 [func_retval0], %fd1;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %fd1;
|
||||
; CHECK-NEXT: ret;
|
||||
%a = sitofp i8 %x to double
|
||||
ret double %a
|
||||
@@ -105,7 +105,7 @@ define i8 @cvt_s8_f32(float %x) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_s8_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_s8_f32_param_0];
|
||||
; CHECK-NEXT: cvt.rzi.s16.f32 %rs1, %f1;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
|
||||
; CHECK-NEXT: and.b32 %r2, %r1, 255;
|
||||
@@ -123,7 +123,7 @@ define i8 @cvt_s8_f64(double %x) {
|
||||
; CHECK-NEXT: .reg .b64 %fd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f64 %fd1, [cvt_s8_f64_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %fd1, [cvt_s8_f64_param_0];
|
||||
; CHECK-NEXT: cvt.rzi.s16.f64 %rs1, %fd1;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
|
||||
; CHECK-NEXT: and.b32 %r2, %r1, 255;
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
; i16
|
||||
|
||||
define i16 @cvt_i16_i32(i32 %x) {
|
||||
; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i16_i32_param_{{[0-9]+}}]
|
||||
; CHECK: ld.param.b16 %r[[R0:[0-9]+]], [cvt_i16_i32_param_{{[0-9]+}}]
|
||||
; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]]
|
||||
; CHECK: ret
|
||||
%a = trunc i32 %x to i16
|
||||
@@ -18,7 +18,7 @@ define i16 @cvt_i16_i32(i32 %x) {
|
||||
}
|
||||
|
||||
define i16 @cvt_i16_i64(i64 %x) {
|
||||
; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i16_i64_param_{{[0-9]+}}]
|
||||
; CHECK: ld.param.b16 %r[[R0:[0-9]+]], [cvt_i16_i64_param_{{[0-9]+}}]
|
||||
; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]]
|
||||
; CHECK: ret
|
||||
%a = trunc i64 %x to i16
|
||||
@@ -30,7 +30,7 @@ define i16 @cvt_i16_i64(i64 %x) {
|
||||
; i32
|
||||
|
||||
define i32 @cvt_i32_i16(i16 %x) {
|
||||
; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i32_i16_param_{{[0-9]+}}]
|
||||
; CHECK: ld.param.b16 %r[[R0:[0-9]+]], [cvt_i32_i16_param_{{[0-9]+}}]
|
||||
; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]]
|
||||
; CHECK: ret
|
||||
%a = zext i16 %x to i32
|
||||
@@ -38,7 +38,7 @@ define i32 @cvt_i32_i16(i16 %x) {
|
||||
}
|
||||
|
||||
define i32 @cvt_i32_i64(i64 %x) {
|
||||
; CHECK: ld.param.u32 %r[[R0:[0-9]+]], [cvt_i32_i64_param_{{[0-9]+}}]
|
||||
; CHECK: ld.param.b32 %r[[R0:[0-9]+]], [cvt_i32_i64_param_{{[0-9]+}}]
|
||||
; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]]
|
||||
; CHECK: ret
|
||||
%a = trunc i64 %x to i32
|
||||
@@ -50,7 +50,7 @@ define i32 @cvt_i32_i64(i64 %x) {
|
||||
; i64
|
||||
|
||||
define i64 @cvt_i64_i16(i16 %x) {
|
||||
; CHECK: ld.param.u16 %rd[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}]
|
||||
; CHECK: ld.param.b16 %rd[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}]
|
||||
; CHECK: st.param.b64 [func_retval{{[0-9]+}}], %rd[[R0]]
|
||||
; CHECK: ret
|
||||
%a = zext i16 %x to i64
|
||||
@@ -58,7 +58,7 @@ define i64 @cvt_i64_i16(i16 %x) {
|
||||
}
|
||||
|
||||
define i64 @cvt_i64_i32(i32 %x) {
|
||||
; CHECK: ld.param.u32 %rd[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}]
|
||||
; CHECK: ld.param.b32 %rd[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}]
|
||||
; CHECK: st.param.b64 [func_retval{{[0-9]+}}], %rd[[R0]]
|
||||
; CHECK: ret
|
||||
%a = zext i32 %x to i64
|
||||
|
||||
@@ -14,7 +14,7 @@ define i32 @cvt_rn_satf_tf32_f32(float %f1) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_satf_tf32_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_satf_tf32_f32_param_0];
|
||||
; CHECK-NEXT: cvt.rn.satfinite.tf32.f32 %r1, %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -29,7 +29,7 @@ define i32 @cvt_rn_relu_satf_tf32_f32(float %f1) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_satf_tf32_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_relu_satf_tf32_f32_param_0];
|
||||
; CHECK-NEXT: cvt.rn.relu.satfinite.tf32.f32 %r1, %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -44,7 +44,7 @@ define i32 @cvt_rz_satf_tf32_f32(float %f1) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_satf_tf32_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_satf_tf32_f32_param_0];
|
||||
; CHECK-NEXT: cvt.rz.satfinite.tf32.f32 %r1, %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -59,7 +59,7 @@ define i32 @cvt_rz_relu_satf_tf32_f32(float %f1) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_satf_tf32_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_relu_satf_tf32_f32_param_0];
|
||||
; CHECK-NEXT: cvt.rz.relu.satfinite.tf32.f32 %r1, %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
|
||||
@@ -14,8 +14,8 @@ define i16 @cvt_rn_sf_e2m3x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_sf_e2m3x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_sf_e2m3x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_sf_e2m3x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_sf_e2m3x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rn.satfinite.e2m3x2.f32 %rs1, %f1, %f2;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
@@ -32,8 +32,8 @@ define i16 @cvt_rn_relu_sf_e2m3x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_sf_e2m3x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_relu_sf_e2m3x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_relu_sf_e2m3x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_relu_sf_e2m3x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rn.satfinite.relu.e2m3x2.f32 %rs1, %f1, %f2;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
@@ -50,8 +50,8 @@ define i16 @cvt_rn_sf_e3m2x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_sf_e3m2x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_sf_e3m2x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_sf_e3m2x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_sf_e3m2x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rn.satfinite.e3m2x2.f32 %rs1, %f1, %f2;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
@@ -68,8 +68,8 @@ define i16 @cvt_rn_relu_sf_e3m2x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_sf_e3m2x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_relu_sf_e3m2x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_relu_sf_e3m2x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_relu_sf_e3m2x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rn.satfinite.relu.e3m2x2.f32 %rs1, %f1, %f2;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
@@ -85,7 +85,7 @@ define <2 x half> @cvt_rn_f16x2_e2m3x2(i16 %in) {
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_f16x2_e2m3x2_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [cvt_rn_f16x2_e2m3x2_param_0];
|
||||
; CHECK-NEXT: cvt.rn.f16x2.e2m3x2 %r1, %rs1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -100,7 +100,7 @@ define <2 x half> @cvt_rn_relu_f16x2_e2m3x2_relu(i16 %in) {
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_relu_f16x2_e2m3x2_relu_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [cvt_rn_relu_f16x2_e2m3x2_relu_param_0];
|
||||
; CHECK-NEXT: cvt.rn.relu.f16x2.e2m3x2 %r1, %rs1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -115,7 +115,7 @@ define <2 x half> @cvt_rn_f16x2_e3m2x2(i16 %in) {
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_f16x2_e3m2x2_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [cvt_rn_f16x2_e3m2x2_param_0];
|
||||
; CHECK-NEXT: cvt.rn.f16x2.e3m2x2 %r1, %rs1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -130,7 +130,7 @@ define <2 x half> @cvt_rn_relu_f16x2_e3m2x2(i16 %in) {
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_relu_f16x2_e3m2x2_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [cvt_rn_relu_f16x2_e3m2x2_param_0];
|
||||
; CHECK-NEXT: cvt.rn.relu.f16x2.e3m2x2 %r1, %rs1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -146,8 +146,8 @@ define i16 @cvt_rz_ue8m0x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_ue8m0x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_ue8m0x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_ue8m0x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rz_ue8m0x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rz.ue8m0x2.f32 %rs1, %f1, %f2;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
@@ -164,8 +164,8 @@ define i16 @cvt_rz_sf_ue8m0x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_sf_ue8m0x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_sf_ue8m0x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_sf_ue8m0x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rz_sf_ue8m0x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rz.satfinite.ue8m0x2.f32 %rs1, %f1, %f2;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
@@ -182,8 +182,8 @@ define i16 @cvt_rp_ue8m0x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rp_ue8m0x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rp_ue8m0x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rp_ue8m0x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rp_ue8m0x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rp.ue8m0x2.f32 %rs1, %f1, %f2;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
@@ -200,8 +200,8 @@ define i16 @cvt_rp_sf_ue8m0x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rp_sf_ue8m0x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rp_sf_ue8m0x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rp_sf_ue8m0x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rp_sf_ue8m0x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rp.satfinite.ue8m0x2.f32 %rs1, %f1, %f2;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
@@ -281,7 +281,7 @@ define <2 x bfloat> @cvt_bf16x2_ue8m0x2(i16 %in) {
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u16 %rs1, [cvt_bf16x2_ue8m0x2_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [cvt_bf16x2_ue8m0x2_param_0];
|
||||
; CHECK-NEXT: cvt.rn.bf16x2.ue8m0x2 %r1, %rs1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
|
||||
@@ -10,8 +10,8 @@ define <2 x bfloat> @cvt_rn_bf16x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_bf16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_bf16x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_bf16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_bf16x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rn.bf16x2.f32 %r1, %f1, %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -26,8 +26,8 @@ define <2 x bfloat> @cvt_rn_relu_bf16x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_bf16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_relu_bf16x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_relu_bf16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_relu_bf16x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rn.relu.bf16x2.f32 %r1, %f1, %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -42,8 +42,8 @@ define <2 x bfloat> @cvt_rz_bf16x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_bf16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_bf16x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_bf16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rz_bf16x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rz.bf16x2.f32 %r1, %f1, %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -58,8 +58,8 @@ define <2 x bfloat> @cvt_rz_relu_bf16x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_bf16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_relu_bf16x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_relu_bf16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rz_relu_bf16x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rz.relu.bf16x2.f32 %r1, %f1, %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -79,8 +79,8 @@ define <2 x half> @cvt_rn_f16x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_f16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_f16x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_f16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_f16x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rn.f16x2.f32 %r1, %f1, %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -95,8 +95,8 @@ define <2 x half> @cvt_rn_relu_f16x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_f16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_relu_f16x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_relu_f16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_relu_f16x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rn.relu.f16x2.f32 %r1, %f1, %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -111,8 +111,8 @@ define <2 x half> @cvt_rz_f16x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_f16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_f16x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_f16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rz_f16x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rz.f16x2.f32 %r1, %f1, %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -127,8 +127,8 @@ define <2 x half> @cvt_rz_relu_f16x2_f32(float %f1, float %f2) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_f16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_relu_f16x2_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_relu_f16x2_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [cvt_rz_relu_f16x2_f32_param_1];
|
||||
; CHECK-NEXT: cvt.rz.relu.f16x2.f32 %r1, %f1, %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -148,7 +148,7 @@ define bfloat @cvt_rn_bf16_f32(float %f1) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_bf16_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_bf16_f32_param_0];
|
||||
; CHECK-NEXT: cvt.rn.bf16.f32 %rs1, %f1;
|
||||
; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -163,7 +163,7 @@ define bfloat @cvt_rn_relu_bf16_f32(float %f1) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_bf16_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_relu_bf16_f32_param_0];
|
||||
; CHECK-NEXT: cvt.rn.relu.bf16.f32 %rs1, %f1;
|
||||
; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -178,7 +178,7 @@ define bfloat @cvt_rz_bf16_f32(float %f1) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_bf16_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_bf16_f32_param_0];
|
||||
; CHECK-NEXT: cvt.rz.bf16.f32 %rs1, %f1;
|
||||
; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -193,7 +193,7 @@ define bfloat @cvt_rz_relu_bf16_f32(float %f1) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_bf16_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_relu_bf16_f32_param_0];
|
||||
; CHECK-NEXT: cvt.rz.relu.bf16.f32 %rs1, %f1;
|
||||
; CHECK-NEXT: st.param.b16 [func_retval0], %rs1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -213,7 +213,7 @@ define i32 @cvt_rna_tf32_f32(float %f1) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rna_tf32_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rna_tf32_f32_param_0];
|
||||
; CHECK-NEXT: cvt.rna.tf32.f32 %r1, %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -231,8 +231,8 @@ define <2 x bfloat> @fold_ff2bf16x2(float %lo, float %hi) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [fold_ff2bf16x2_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [fold_ff2bf16x2_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [fold_ff2bf16x2_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [fold_ff2bf16x2_param_1];
|
||||
; CHECK-NEXT: cvt.rn.bf16x2.f32 %r1, %f2, %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -250,8 +250,8 @@ define <2 x half> @fold_ff2f16x2(float %lo, float %hi) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [fold_ff2f16x2_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [fold_ff2f16x2_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [fold_ff2f16x2_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [fold_ff2f16x2_param_1];
|
||||
; CHECK-NEXT: cvt.rn.f16x2.f32 %r1, %f2, %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
|
||||
@@ -14,7 +14,7 @@ define i32 @cvt_rn_tf32_f32(float %f1) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_tf32_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_tf32_f32_param_0];
|
||||
; CHECK-NEXT: cvt.rn.tf32.f32 %r1, %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -29,7 +29,7 @@ define i32 @cvt_rn_relu_tf32_f32(float %f1) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_tf32_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_relu_tf32_f32_param_0];
|
||||
; CHECK-NEXT: cvt.rn.relu.tf32.f32 %r1, %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -44,7 +44,7 @@ define i32 @cvt_rz_tf32_f32(float %f1) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_tf32_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_tf32_f32_param_0];
|
||||
; CHECK-NEXT: cvt.rz.tf32.f32 %r1, %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -59,7 +59,7 @@ define i32 @cvt_rz_relu_tf32_f32(float %f1) {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_tf32_f32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_relu_tf32_f32_param_0];
|
||||
; CHECK-NEXT: cvt.rz.relu.tf32.f32 %r1, %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
|
||||
@@ -11,10 +11,10 @@ define float @fcopysign_f_f(float %a, float %b) {
|
||||
; CHECK-NEXT: .reg .b32 %f<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [fcopysign_f_f_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [fcopysign_f_f_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [fcopysign_f_f_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [fcopysign_f_f_param_1];
|
||||
; CHECK-NEXT: copysign.f32 %f3, %f2, %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f3;
|
||||
; CHECK-NEXT: ret;
|
||||
%val = call float @llvm.copysign.f32(float %a, float %b)
|
||||
ret float %val
|
||||
@@ -26,10 +26,10 @@ define double @fcopysign_d_d(double %a, double %b) {
|
||||
; CHECK-NEXT: .reg .b64 %fd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f64 %fd1, [fcopysign_d_d_param_0];
|
||||
; CHECK-NEXT: ld.param.f64 %fd2, [fcopysign_d_d_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %fd1, [fcopysign_d_d_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %fd2, [fcopysign_d_d_param_1];
|
||||
; CHECK-NEXT: copysign.f64 %fd3, %fd2, %fd1;
|
||||
; CHECK-NEXT: st.param.f64 [func_retval0], %fd3;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %fd3;
|
||||
; CHECK-NEXT: ret;
|
||||
%val = call double @llvm.copysign.f64(double %a, double %b)
|
||||
ret double %val
|
||||
@@ -43,15 +43,15 @@ define float @fcopysign_f_d(float %a, double %b) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [fcopysign_f_d_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [fcopysign_f_d_param_0];
|
||||
; CHECK-NEXT: abs.f32 %f2, %f1;
|
||||
; CHECK-NEXT: neg.f32 %f3, %f2;
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [fcopysign_f_d_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [fcopysign_f_d_param_1];
|
||||
; CHECK-NEXT: shr.u64 %rd2, %rd1, 63;
|
||||
; CHECK-NEXT: and.b64 %rd3, %rd2, 1;
|
||||
; CHECK-NEXT: setp.ne.b64 %p1, %rd3, 0;
|
||||
; CHECK-NEXT: selp.f32 %f4, %f3, %f2, %p1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f4;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f4;
|
||||
; CHECK-NEXT: ret;
|
||||
%c = fptrunc double %b to float
|
||||
%val = call float @llvm.copysign.f32(float %a, float %c)
|
||||
@@ -66,15 +66,15 @@ define float @fcopysign_f_h(float %a, half %b) {
|
||||
; CHECK-NEXT: .reg .b32 %f<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [fcopysign_f_h_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [fcopysign_f_h_param_0];
|
||||
; CHECK-NEXT: abs.f32 %f2, %f1;
|
||||
; CHECK-NEXT: neg.f32 %f3, %f2;
|
||||
; CHECK-NEXT: ld.param.u16 %rs1, [fcopysign_f_h_param_1];
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [fcopysign_f_h_param_1];
|
||||
; CHECK-NEXT: shr.u16 %rs2, %rs1, 15;
|
||||
; CHECK-NEXT: and.b16 %rs3, %rs2, 1;
|
||||
; CHECK-NEXT: setp.ne.b16 %p1, %rs3, 0;
|
||||
; CHECK-NEXT: selp.f32 %f4, %f3, %f2, %p1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f4;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f4;
|
||||
; CHECK-NEXT: ret;
|
||||
%c = fpext half %b to float
|
||||
%val = call float @llvm.copysign.f32(float %a, float %c)
|
||||
@@ -89,15 +89,15 @@ define double @fcopysign_d_f(double %a, float %b) {
|
||||
; CHECK-NEXT: .reg .b64 %fd<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f64 %fd1, [fcopysign_d_f_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %fd1, [fcopysign_d_f_param_0];
|
||||
; CHECK-NEXT: abs.f64 %fd2, %fd1;
|
||||
; CHECK-NEXT: neg.f64 %fd3, %fd2;
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [fcopysign_d_f_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [fcopysign_d_f_param_1];
|
||||
; CHECK-NEXT: shr.u32 %r2, %r1, 31;
|
||||
; CHECK-NEXT: and.b32 %r3, %r2, 1;
|
||||
; CHECK-NEXT: setp.ne.b32 %p1, %r3, 0;
|
||||
; CHECK-NEXT: selp.f64 %fd4, %fd3, %fd2, %p1;
|
||||
; CHECK-NEXT: st.param.f64 [func_retval0], %fd4;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %fd4;
|
||||
; CHECK-NEXT: ret;
|
||||
%c = fpext float %b to double
|
||||
%val = call double @llvm.copysign.f64(double %a, double %c)
|
||||
@@ -112,15 +112,15 @@ define double @fcopysign_d_h(double %a, half %b) {
|
||||
; CHECK-NEXT: .reg .b64 %fd<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f64 %fd1, [fcopysign_d_h_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %fd1, [fcopysign_d_h_param_0];
|
||||
; CHECK-NEXT: abs.f64 %fd2, %fd1;
|
||||
; CHECK-NEXT: neg.f64 %fd3, %fd2;
|
||||
; CHECK-NEXT: ld.param.u16 %rs1, [fcopysign_d_h_param_1];
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [fcopysign_d_h_param_1];
|
||||
; CHECK-NEXT: shr.u16 %rs2, %rs1, 15;
|
||||
; CHECK-NEXT: and.b16 %rs3, %rs2, 1;
|
||||
; CHECK-NEXT: setp.ne.b16 %p1, %rs3, 0;
|
||||
; CHECK-NEXT: selp.f64 %fd4, %fd3, %fd2, %p1;
|
||||
; CHECK-NEXT: st.param.f64 [func_retval0], %fd4;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %fd4;
|
||||
; CHECK-NEXT: ret;
|
||||
%c = fpext half %b to double
|
||||
%val = call double @llvm.copysign.f64(double %a, double %c)
|
||||
|
||||
@@ -25,14 +25,14 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4;
|
||||
; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1}], [%rd2], %rs1;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
@@ -44,14 +44,14 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3}], [%r2], %rs1;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
@@ -74,15 +74,15 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4;
|
||||
; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
@@ -94,15 +94,15 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_2d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_2d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
@@ -125,16 +125,16 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_3d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_3d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4;
|
||||
; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
@@ -146,16 +146,16 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
@@ -178,17 +178,17 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_4d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_4d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4;
|
||||
; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
@@ -200,17 +200,17 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_tile_4d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_4d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
@@ -233,18 +233,18 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_5d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_5d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4;
|
||||
; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
@@ -256,18 +256,18 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
@@ -290,17 +290,17 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_im2col_3d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2col_3d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1};
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4;
|
||||
; CHECK-PTX64-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
@@ -312,17 +312,17 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1};
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
@@ -345,19 +345,19 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_im2col_4d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2col_4d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2};
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4;
|
||||
; CHECK-PTX64-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
@@ -369,19 +369,19 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_4d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_4d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2};
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
@@ -404,21 +404,21 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_im2col_5d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_5d_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
|
||||
; CHECK-PTX64-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2col_5d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_5d_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3};
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4;
|
||||
; CHECK-PTX64-NEXT: ld.param.u16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
@@ -430,21 +430,21 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_5d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_im2col_5d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r7, [cp_async_bulk_tensor_g2s_im2col_5d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_5d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2col_5d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2col_5d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3};
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_12];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
|
||||
@@ -22,10 +22,10 @@ define void @cp_async_bulk_tensor_prefetch_tile_1d(ptr %tmap, i32 %d0, i64 %ch)
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_tile_1d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_tile_1d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_tile_1d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_tile_1d_param_1];
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.1d.L2.global.tile [%rd1, {%r1}];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_tile_1d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_tile_1d_param_2];
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.1d.L2.global.tile.L2::cache_hint [%rd1, {%r1}], %rd2;
|
||||
; CHECK-PTX-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tmap, i32 %d0, i64 undef, i1 0)
|
||||
@@ -41,11 +41,11 @@ define void @cp_async_bulk_tensor_prefetch_tile_2d(i32 %flag, ptr %tmap, i32 %d0
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_tile_2d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_tile_2d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_tile_2d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_tile_2d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_tile_2d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_tile_2d_param_3];
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.2d.L2.global.tile [%rd1, {%r1, %r2}];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_tile_2d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_tile_2d_param_4];
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.2d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2}], %rd2;
|
||||
; CHECK-PTX-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
|
||||
@@ -62,14 +62,14 @@ define void @cp_async_bulk_tensor_prefetch_3d(i32 %flag, ptr %tmap, i32 %d0, i32
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_3d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_3d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_3d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_prefetch_3d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_3d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_3d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_3d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_prefetch_3d_param_4];
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.tile [%rd1, {%r1, %r2, %r3}];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_3d_param_6];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_3d_param_6];
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3}], %rd2;
|
||||
; CHECK-PTX-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_prefetch_3d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_prefetch_3d_param_5];
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col [%rd1, {%r1, %r2, %r3}], {%rs1};
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1}, %rd2;
|
||||
; CHECK-PTX-NEXT: ret;
|
||||
@@ -90,16 +90,16 @@ define void @cp_async_bulk_tensor_prefetch_4d(i32 %flag, ptr %tmap, i32 %d0, i32
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_4d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_4d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_4d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_prefetch_4d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_prefetch_4d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_4d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_4d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_4d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_prefetch_4d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_prefetch_4d_param_5];
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4}];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_4d_param_8];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_4d_param_8];
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], %rd2;
|
||||
; CHECK-PTX-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_prefetch_4d_param_6];
|
||||
; CHECK-PTX-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_prefetch_4d_param_7];
|
||||
; CHECK-PTX-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_prefetch_4d_param_6];
|
||||
; CHECK-PTX-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_prefetch_4d_param_7];
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2};
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2;
|
||||
; CHECK-PTX-NEXT: ret;
|
||||
@@ -120,18 +120,18 @@ define void @cp_async_bulk_tensor_prefetch_5d(i32 %flag, ptr %tmap, i32 %d0, i32
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_5d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_5d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_5d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_prefetch_5d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_prefetch_5d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_prefetch_5d_param_6];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_5d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_5d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_5d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_prefetch_5d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_prefetch_5d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_prefetch_5d_param_6];
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4, %r5}];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_5d_param_10];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_5d_param_10];
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], %rd2;
|
||||
; CHECK-PTX-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_prefetch_5d_param_7];
|
||||
; CHECK-PTX-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_prefetch_5d_param_8];
|
||||
; CHECK-PTX-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_prefetch_5d_param_9];
|
||||
; CHECK-PTX-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_prefetch_5d_param_7];
|
||||
; CHECK-PTX-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_prefetch_5d_param_8];
|
||||
; CHECK-PTX-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_prefetch_5d_param_9];
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2, %rs3};
|
||||
; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2, %rs3}, %rd2;
|
||||
; CHECK-PTX-NEXT: ret;
|
||||
|
||||
@@ -22,10 +22,10 @@ define void @cp_async_bulk_tensor_reduce_tile_1d(ptr addrspace(3) %src, ptr %tma
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_1d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_1d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_1d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_1d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_tile_1d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_tile_1d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_tile_1d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_tile_1d_param_3];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
|
||||
@@ -71,11 +71,11 @@ define void @cp_async_bulk_tensor_reduce_tile_2d(ptr addrspace(3) %src, ptr %tma
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_2d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_2d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_2d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_2d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_2d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_tile_2d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_tile_2d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_tile_2d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_tile_2d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_tile_2d_param_4];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
|
||||
@@ -121,12 +121,12 @@ define void @cp_async_bulk_tensor_reduce_tile_3d(ptr addrspace(3) %src, ptr %tma
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_3d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_3d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_3d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_3d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_3d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_3d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_tile_3d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_tile_3d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_tile_3d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_tile_3d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_tile_3d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_tile_3d_param_5];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
@@ -172,13 +172,13 @@ define void @cp_async_bulk_tensor_reduce_tile_4d(ptr addrspace(3) %src, ptr %tma
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_4d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_4d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_4d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_4d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_4d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_tile_4d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_4d_param_6];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_tile_4d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_tile_4d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_tile_4d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_tile_4d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_tile_4d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_reduce_tile_4d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_tile_4d_param_6];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
@@ -224,14 +224,14 @@ define void @cp_async_bulk_tensor_reduce_tile_5d(ptr addrspace(3) %src, ptr %tma
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_5d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_5d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_5d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_5d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_5d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_tile_5d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_reduce_tile_5d_param_6];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_5d_param_7];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_tile_5d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_tile_5d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_tile_5d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_tile_5d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_tile_5d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_reduce_tile_5d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_reduce_tile_5d_param_6];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_tile_5d_param_7];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
|
||||
@@ -277,12 +277,12 @@ define void @cp_async_bulk_tensor_reduce_im2col_3d(ptr addrspace(3) %src, ptr %t
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_3d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_3d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_3d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_3d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_3d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_3d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_im2col_3d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_im2col_3d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_im2col_3d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_im2col_3d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_im2col_3d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_im2col_3d_param_5];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
@@ -328,13 +328,13 @@ define void @cp_async_bulk_tensor_reduce_im2col_4d(ptr addrspace(3) %src, ptr %t
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_4d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_4d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_4d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_4d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_4d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_im2col_4d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_4d_param_6];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_im2col_4d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_im2col_4d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_im2col_4d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_im2col_4d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_im2col_4d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_reduce_im2col_4d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_im2col_4d_param_6];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
@@ -380,14 +380,14 @@ define void @cp_async_bulk_tensor_reduce_im2col_5d(ptr addrspace(3) %src, ptr %t
|
||||
; CHECK-PTX-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX-EMPTY:
|
||||
; CHECK-PTX-NEXT: // %bb.0:
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_5d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_5d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_5d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_5d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_5d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_im2col_5d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_reduce_im2col_5d_param_6];
|
||||
; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_5d_param_7];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_im2col_5d_param_0];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_im2col_5d_param_1];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_im2col_5d_param_2];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_im2col_5d_param_3];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_im2col_5d_param_4];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_reduce_im2col_5d_param_5];
|
||||
; CHECK-PTX-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_reduce_im2col_5d_param_6];
|
||||
; CHECK-PTX-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_im2col_5d_param_7];
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
|
||||
; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
|
||||
|
||||
@@ -24,11 +24,11 @@ define void @cp_async_bulk_tensor_s2g_tile_1d(ptr addrspace(3) %src, ptr %tmap,
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_2];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd2, {%r1}], [%rd1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_tile_1d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_tile_1d_param_3];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
;
|
||||
@@ -38,11 +38,11 @@ define void @cp_async_bulk_tensor_s2g_tile_1d(ptr addrspace(3) %src, ptr %tmap,
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_tile_1d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_1d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd1, {%r2}], [%r1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2}], [%r1], %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0)
|
||||
@@ -58,12 +58,12 @@ define void @cp_async_bulk_tensor_s2g_tile_2d(i32 %flag, ptr addrspace(3) %src,
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_tile_2d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_2d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_4];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_tile_2d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_tile_2d_param_5];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
;
|
||||
@@ -73,12 +73,12 @@ define void @cp_async_bulk_tensor_s2g_tile_2d(i32 %flag, ptr addrspace(3) %src,
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_tile_2d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_tile_2d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_2d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_tile_2d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3}], [%r1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3}], [%r1], %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0)
|
||||
@@ -94,13 +94,13 @@ define void @cp_async_bulk_tensor_s2g_3d(i32 %flag, ptr addrspace(3) %src, ptr %
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_3d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_3d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_3d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_3d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_3d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_3d_param_5];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_3d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_3d_param_6];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
|
||||
@@ -112,13 +112,13 @@ define void @cp_async_bulk_tensor_s2g_3d(i32 %flag, ptr addrspace(3) %src, ptr %
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_3d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_3d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_3d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_3d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_3d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_3d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_3d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_3d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4}], [%r1], %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4}], [%r1], %rd2;
|
||||
@@ -139,14 +139,14 @@ define void @cp_async_bulk_tensor_s2g_4d(i32 %flag, ptr addrspace(3) %src, ptr %
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_4d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_4d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_4d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_4d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_4d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_4d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_4d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_4d_param_6];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_4d_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_4d_param_7];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
|
||||
@@ -158,14 +158,14 @@ define void @cp_async_bulk_tensor_s2g_4d(i32 %flag, ptr addrspace(3) %src, ptr %
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_4d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_4d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_4d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_4d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_s2g_4d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_4d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_4d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_4d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_4d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_4d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5}], [%r1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5}], [%r1], %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4, %r5}], [%r1];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5}], [%r1], %rd2;
|
||||
@@ -186,15 +186,15 @@ define void @cp_async_bulk_tensor_s2g_5d(i32 %flag, ptr addrspace(3) %src, ptr %
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_5d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_5d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_5d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_5d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_s2g_5d_param_7];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_5d_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_5d_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_5d_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_5d_param_6];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_5d_param_7];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_5d_param_8];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_5d_param_8];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
|
||||
@@ -206,15 +206,15 @@ define void @cp_async_bulk_tensor_s2g_5d(i32 %flag, ptr addrspace(3) %src, ptr %
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_5d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_5d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_5d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_5d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_s2g_5d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_s2g_5d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_5d_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_5d_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_5d_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_5d_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_5d_param_6];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_s2g_5d_param_7];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_8];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2;
|
||||
|
||||
@@ -19,14 +19,14 @@ define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_g2s_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_g2s_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_g2s_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_g2s_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_g2s_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_3];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_g2s_param_5];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_g2s_param_5];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rd4;
|
||||
; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_g2s_param_4];
|
||||
; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_g2s_param_4];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%rd3], [%rd1], %r1, [%rd2], %rs1;
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rs1, %rd4;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
@@ -38,14 +38,14 @@ define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_g2s_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_g2s_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_g2s_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_g2s_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_g2s_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_g2s_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%r2], [%rd1], %r3, [%r1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_g2s_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_5];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_g2s_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_g2s_param_4];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%r2], [%rd1], %r3, [%r1], %rs1;
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rs1, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
@@ -63,11 +63,11 @@ define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_s2g_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_s2g_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_s2g_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_2];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1;
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_s2g_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_s2g_param_3];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd2], [%rd1], %r1, %rd3;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
;
|
||||
@@ -77,11 +77,11 @@ define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_s2g_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_s2g_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_s2g_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_s2g_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_s2g_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd1], [%r1], %r2, %rd2;
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 0, i1 0)
|
||||
@@ -96,10 +96,10 @@ define void @cp_async_bulk_cta_to_cluster(ptr addrspace(3) %src, ptr addrspace(3
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_cta_to_cluster_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_cta_to_cluster_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_cta_to_cluster_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_cta_to_cluster_param_3];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_cta_to_cluster_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_cta_to_cluster_param_1];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_cta_to_cluster_param_2];
|
||||
; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_cta_to_cluster_param_3];
|
||||
; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2];
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
;
|
||||
@@ -108,10 +108,10 @@ define void @cp_async_bulk_cta_to_cluster(ptr addrspace(3) %src, ptr addrspace(3
|
||||
; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-PTX-SHARED32-EMPTY:
|
||||
; CHECK-PTX-SHARED32-NEXT: // %bb.0:
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_cta_to_cluster_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_cta_to_cluster_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_cta_to_cluster_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_cta_to_cluster_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_cta_to_cluster_param_0];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_cta_to_cluster_param_1];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_cta_to_cluster_param_2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_cta_to_cluster_param_3];
|
||||
; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%r3], [%r1], %r4, [%r2];
|
||||
; CHECK-PTX-SHARED32-NEXT: ret;
|
||||
tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src, i32 %size)
|
||||
@@ -125,9 +125,9 @@ define void @cp_async_bulk_prefetch(ptr addrspace(1) %src, i32 %size, i64 %ch) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [cp_async_bulk_prefetch_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [cp_async_bulk_prefetch_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [cp_async_bulk_prefetch_param_2];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [cp_async_bulk_prefetch_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [cp_async_bulk_prefetch_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [cp_async_bulk_prefetch_param_2];
|
||||
; CHECK-NEXT: cp.async.bulk.prefetch.L2.global.L2::cache_hint [%rd1], %r1, %rd2;
|
||||
; CHECK-NEXT: cp.async.bulk.prefetch.L2.global [%rd1], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
|
||||
@@ -17,7 +17,7 @@ define i32 @myctlz(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [myctlz_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [myctlz_param_0];
|
||||
; CHECK-NEXT: clz.b32 %r2, %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -30,7 +30,7 @@ define i32 @myctlz_2(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [myctlz_2_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [myctlz_2_param_0];
|
||||
; CHECK-NEXT: clz.b32 %r2, %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -47,7 +47,7 @@ define i64 @myctlz64(i64 %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [myctlz64_param_0];
|
||||
; CHECK-NEXT: clz.b64 %r1, %rd1;
|
||||
; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
|
||||
@@ -62,7 +62,7 @@ define i64 @myctlz64_2(i64 %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_2_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [myctlz64_2_param_0];
|
||||
; CHECK-NEXT: clz.b64 %r1, %rd1;
|
||||
; CHECK-NEXT: cvt.u64.u32 %rd2, %r1;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %rd2;
|
||||
@@ -81,7 +81,7 @@ define i32 @myctlz64_as_32(i64 %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [myctlz64_as_32_param_0];
|
||||
; CHECK-NEXT: clz.b64 %r1, %rd1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -96,7 +96,7 @@ define i32 @myctlz64_as_32_2(i64 %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_2_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [myctlz64_as_32_2_param_0];
|
||||
; CHECK-NEXT: clz.b64 %r1, %rd1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -115,7 +115,7 @@ define i16 @myctlz_ret16(i16 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %r1, [myctlz_ret16_param_0];
|
||||
; CHECK-NEXT: clz.b32 %r2, %r1;
|
||||
; CHECK-NEXT: add.s32 %r3, %r2, -16;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
@@ -129,7 +129,7 @@ define i16 @myctlz_ret16_2(i16 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_2_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %r1, [myctlz_ret16_2_param_0];
|
||||
; CHECK-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-NEXT: clz.b32 %r3, %r2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
@@ -147,11 +147,11 @@ define void @myctlz_store16(i16 %a, ptr %b) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %r1, [myctlz_store16_param_0];
|
||||
; CHECK-NEXT: clz.b32 %r2, %r1;
|
||||
; CHECK-NEXT: add.s32 %r3, %r2, -16;
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_param_1];
|
||||
; CHECK-NEXT: st.u16 [%rd1], %r3;
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [myctlz_store16_param_1];
|
||||
; CHECK-NEXT: st.b16 [%rd1], %r3;
|
||||
; CHECK-NEXT: ret;
|
||||
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
|
||||
store i16 %val, ptr %b
|
||||
@@ -164,11 +164,11 @@ define void @myctlz_store16_2(i16 %a, ptr %b) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_2_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %r1, [myctlz_store16_2_param_0];
|
||||
; CHECK-NEXT: clz.b32 %r2, %r1;
|
||||
; CHECK-NEXT: add.s32 %r3, %r2, -16;
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_2_param_1];
|
||||
; CHECK-NEXT: st.u16 [%rd1], %r3;
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [myctlz_store16_2_param_1];
|
||||
; CHECK-NEXT: st.b16 [%rd1], %r3;
|
||||
; CHECK-NEXT: ret;
|
||||
%val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
|
||||
store i16 %val, ptr %b
|
||||
|
||||
@@ -9,11 +9,11 @@
|
||||
; Verify that loads with different memory types are not subject to CSE
|
||||
; once they are promoted to the same type.
|
||||
;
|
||||
; CHECK: ld.global.v2.u8 {%[[B1:rs[0-9]+]], %[[B2:rs[0-9]+]]}, [a];
|
||||
; CHECK: st.global.v2.u8 [b], {%[[B1]], %[[B2]]};
|
||||
; CHECK: ld.global.v2.b8 {%[[B1:rs[0-9]+]], %[[B2:rs[0-9]+]]}, [a];
|
||||
; CHECK: st.global.v2.b8 [b], {%[[B1]], %[[B2]]};
|
||||
;
|
||||
; CHECK: ld.global.u32 %[[C:r[0-9]+]], [a];
|
||||
; CHECK: st.global.u32 [c], %[[C]];
|
||||
; CHECK: ld.global.b32 %[[C:r[0-9]+]], [a];
|
||||
; CHECK: st.global.b32 [c], %[[C]];
|
||||
|
||||
define void @test1() #0 {
|
||||
%1 = load <2 x i8>, ptr addrspace(1) @a, align 8
|
||||
|
||||
@@ -66,9 +66,9 @@ define void @define_private_global(i64 %val) {
|
||||
;
|
||||
; Also check that the if-then is still here, otherwise we may not be testing
|
||||
; the "more-than-one-use" part.
|
||||
; CHECK: st.shared.u64 [private_global_used_more_than_once_in_same_fct],
|
||||
; CHECK: st.shared.b64 [private_global_used_more_than_once_in_same_fct],
|
||||
; CHECK: mov.b64 %[[VAR:.*]], 25
|
||||
; CHECK: st.shared.u64 [private_global_used_more_than_once_in_same_fct], %[[VAR]]
|
||||
; CHECK: st.shared.b64 [private_global_used_more_than_once_in_same_fct], %[[VAR]]
|
||||
define void @define_private_global_more_than_one_use(i64 %val, i1 %cond) {
|
||||
store i64 %val, ptr addrspace(3) @private_global_used_more_than_once_in_same_fct
|
||||
br i1 %cond, label %then, label %end
|
||||
|
||||
@@ -13,7 +13,7 @@ define void @discard_global_L2(ptr addrspace(1) %global_ptr) {
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [discard_global_L2_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [discard_global_L2_param_0];
|
||||
; CHECK-PTX64-NEXT: discard.global.L2 [%rd1], 128;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
tail call void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 128)
|
||||
@@ -26,7 +26,7 @@ define void @discard_L2(ptr %ptr) {
|
||||
; CHECK-PTX64-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-PTX64-EMPTY:
|
||||
; CHECK-PTX64-NEXT: // %bb.0:
|
||||
; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [discard_L2_param_0];
|
||||
; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [discard_L2_param_0];
|
||||
; CHECK-PTX64-NEXT: discard.L2 [%rd1], 128;
|
||||
; CHECK-PTX64-NEXT: ret;
|
||||
tail call void @llvm.nvvm.discard.L2(ptr %ptr, i64 128)
|
||||
|
||||
@@ -14,7 +14,7 @@ define i32 @test_disjoint_or_addr(i16 %a) {
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: mov.b64 %rd1, a;
|
||||
; CHECK-NEXT: cvta.global.u64 %rd2, %rd1;
|
||||
; CHECK-NEXT: ld.u32 %r1, [%rd2+8];
|
||||
; CHECK-NEXT: ld.b32 %r1, [%rd2+8];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%a1 = ptrtoint ptr @a to i64
|
||||
|
||||
@@ -18,17 +18,17 @@ define i32 @test_distributed_shared_cluster_common(ptr %ptr, ptr addrspace(3) %s
|
||||
; CHECK-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_common_param_0];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [test_distributed_shared_cluster_common_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_distributed_shared_cluster_common_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_distributed_shared_cluster_common_param_1];
|
||||
; CHECK-NEXT: mov.u32 %r1, %ctaid.x;
|
||||
; CHECK-NEXT: xor.b32 %r2, %r1, 1;
|
||||
; CHECK-NEXT: isspacep.shared::cluster %p1, %rd1;
|
||||
; CHECK-NEXT: mapa.u64 %rd3, %rd1, %r2;
|
||||
; CHECK-NEXT: isspacep.shared::cluster %p2, %rd3;
|
||||
; CHECK-NEXT: mapa.shared::cluster.u64 %rd4, %rd2, %r2;
|
||||
; CHECK-NEXT: ld.shared::cluster.u32 %r3, [%rd4];
|
||||
; CHECK-NEXT: ld.shared::cluster.b32 %r3, [%rd4];
|
||||
; CHECK-NEXT: add.s32 %r4, %r3, 42;
|
||||
; CHECK-NEXT: st.shared::cluster.u32 [%rd4], %r4;
|
||||
; CHECK-NEXT: st.shared::cluster.b32 [%rd4], %r4;
|
||||
; CHECK-NEXT: selp.b32 %r5, 1, 0, %p1;
|
||||
; CHECK-NEXT: selp.b32 %r6, 1, 0, %p2;
|
||||
; CHECK-NEXT: add.s32 %r7, %r5, %r6;
|
||||
@@ -64,7 +64,7 @@ define void @test_distributed_shared_cluster_float_atomic(ptr addrspace(7) %dsme
|
||||
; CHECK-NEXT: .reg .b64 %fd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_float_atomic_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_distributed_shared_cluster_float_atomic_param_0];
|
||||
; CHECK-NEXT: mov.b16 %rs1, 0x3C00;
|
||||
; CHECK-NEXT: atom.shared::cluster.add.noftz.f16 %rs2, [%rd1], %rs1;
|
||||
; CHECK-NEXT: mov.b16 %rs3, 0x3F80;
|
||||
@@ -90,7 +90,7 @@ define void @test_distributed_shared_cluster_int_atomic(ptr addrspace(7) %dsmem_
|
||||
; CHECK-NEXT: .reg .b64 %rd<8>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_int_atomic_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_distributed_shared_cluster_int_atomic_param_0];
|
||||
; CHECK-NEXT: atom.shared::cluster.add.u32 %r1, [%rd1], 1;
|
||||
; CHECK-NEXT: atom.shared::cluster.add.u64 %rd2, [%rd1], 1;
|
||||
; CHECK-NEXT: atom.shared::cluster.exch.b32 %r2, [%rd1], 1;
|
||||
@@ -142,7 +142,7 @@ define void @test_distributed_shared_cluster_bitwise_atomic(ptr addrspace(7) %ds
|
||||
; CHECK-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_bitwise_atomic_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_distributed_shared_cluster_bitwise_atomic_param_0];
|
||||
; CHECK-NEXT: atom.shared::cluster.and.b32 %r1, [%rd1], 1;
|
||||
; CHECK-NEXT: atom.shared::cluster.and.b64 %rd2, [%rd1], 1;
|
||||
; CHECK-NEXT: atom.shared::cluster.or.b32 %r2, [%rd1], 1;
|
||||
@@ -171,7 +171,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
|
||||
; CHECK-NEXT: .reg .b64 %rd<12>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0];
|
||||
; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r24, [%rd2], 1, 0;
|
||||
; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r25, [%rd2], 1, 0;
|
||||
; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r26, [%rd2], 1, 0;
|
||||
@@ -205,7 +205,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
|
||||
; CHECK-NEXT: not.b32 %r2, %r36;
|
||||
; CHECK-NEXT: mov.b32 %r37, 1;
|
||||
; CHECK-NEXT: shl.b32 %r3, %r37, %r1;
|
||||
; CHECK-NEXT: ld.shared::cluster.u32 %r38, [%rd1];
|
||||
; CHECK-NEXT: ld.shared::cluster.b32 %r38, [%rd1];
|
||||
; CHECK-NEXT: and.b32 %r48, %r38, %r2;
|
||||
; CHECK-NEXT: $L__BB4_1: // %partword.cmpxchg.loop33
|
||||
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -220,7 +220,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
|
||||
; CHECK-NEXT: mov.b32 %r48, %r7;
|
||||
; CHECK-NEXT: @%p2 bra $L__BB4_1;
|
||||
; CHECK-NEXT: $L__BB4_3: // %partword.cmpxchg.end31
|
||||
; CHECK-NEXT: ld.shared::cluster.u32 %r40, [%rd1];
|
||||
; CHECK-NEXT: ld.shared::cluster.b32 %r40, [%rd1];
|
||||
; CHECK-NEXT: and.b32 %r49, %r40, %r2;
|
||||
; CHECK-NEXT: $L__BB4_4: // %partword.cmpxchg.loop23
|
||||
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -237,7 +237,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
|
||||
; CHECK-NEXT: $L__BB4_6: // %partword.cmpxchg.end21
|
||||
; CHECK-NEXT: fence.acq_rel.sys;
|
||||
; CHECK-NEXT: fence.acq_rel.sys;
|
||||
; CHECK-NEXT: ld.shared::cluster.u32 %r42, [%rd1];
|
||||
; CHECK-NEXT: ld.shared::cluster.b32 %r42, [%rd1];
|
||||
; CHECK-NEXT: and.b32 %r50, %r42, %r2;
|
||||
; CHECK-NEXT: $L__BB4_7: // %partword.cmpxchg.loop13
|
||||
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -253,7 +253,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
|
||||
; CHECK-NEXT: @%p6 bra $L__BB4_7;
|
||||
; CHECK-NEXT: $L__BB4_9: // %partword.cmpxchg.end11
|
||||
; CHECK-NEXT: fence.acq_rel.sys;
|
||||
; CHECK-NEXT: ld.shared::cluster.u32 %r44, [%rd1];
|
||||
; CHECK-NEXT: ld.shared::cluster.b32 %r44, [%rd1];
|
||||
; CHECK-NEXT: and.b32 %r51, %r44, %r2;
|
||||
; CHECK-NEXT: $L__BB4_10: // %partword.cmpxchg.loop3
|
||||
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
@@ -270,7 +270,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr
|
||||
; CHECK-NEXT: $L__BB4_12: // %partword.cmpxchg.end1
|
||||
; CHECK-NEXT: fence.acq_rel.sys;
|
||||
; CHECK-NEXT: fence.sc.sys;
|
||||
; CHECK-NEXT: ld.shared::cluster.u32 %r46, [%rd1];
|
||||
; CHECK-NEXT: ld.shared::cluster.b32 %r46, [%rd1];
|
||||
; CHECK-NEXT: and.b32 %r52, %r46, %r2;
|
||||
; CHECK-NEXT: $L__BB4_13: // %partword.cmpxchg.loop
|
||||
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
|
||||
|
||||
@@ -8,15 +8,15 @@ define float @div_full(float %a, float %b) {
|
||||
; CHECK-NEXT: .reg .b32 %f<9>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [div_full_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [div_full_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [div_full_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [div_full_param_1];
|
||||
; CHECK-NEXT: div.full.f32 %f3, %f1, %f2;
|
||||
; CHECK-NEXT: mov.b32 %f4, 0f40400000;
|
||||
; CHECK-NEXT: div.full.f32 %f5, %f3, %f4;
|
||||
; CHECK-NEXT: div.full.ftz.f32 %f6, %f5, %f2;
|
||||
; CHECK-NEXT: mov.b32 %f7, 0f40800000;
|
||||
; CHECK-NEXT: div.full.ftz.f32 %f8, %f6, %f7;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f8;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f8;
|
||||
; CHECK-NEXT: ret;
|
||||
%1 = call float @llvm.nvvm.div.full(float %a, float %b)
|
||||
%2 = call float @llvm.nvvm.div.full(float %1, float 3.0)
|
||||
|
||||
@@ -15,9 +15,9 @@ define i32 @test_dp4a_u32_u32(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_u32_u32_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_u32_u32_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_u32_u32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_dp4a_u32_u32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_dp4a_u32_u32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_dp4a_u32_u32_param_2];
|
||||
; CHECK-NEXT: dp4a.u32.u32 %r4, %r1, %r2, %r3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -31,7 +31,7 @@ define i32 @test_dp4a_u32imm_u32imm(i32 %c) {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_u32imm_u32imm_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_dp4a_u32imm_u32imm_param_0];
|
||||
; CHECK-NEXT: mov.b32 %r2, 0;
|
||||
; CHECK-NEXT: dp4a.u32.u32 %r3, %r2, %r2, %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
@@ -46,9 +46,9 @@ define i32 @test_dp4a_u32_s32(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_u32_s32_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_u32_s32_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_u32_s32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_dp4a_u32_s32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_dp4a_u32_s32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_dp4a_u32_s32_param_2];
|
||||
; CHECK-NEXT: dp4a.u32.s32 %r4, %r1, %r2, %r3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -62,9 +62,9 @@ define i32 @test_dp4a_s32_u32(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_s32_u32_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_s32_u32_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_s32_u32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_dp4a_s32_u32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_dp4a_s32_u32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_dp4a_s32_u32_param_2];
|
||||
; CHECK-NEXT: dp4a.s32.u32 %r4, %r1, %r2, %r3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -78,9 +78,9 @@ define i32 @test_dp4a_s32_s32(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_s32_s32_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_s32_s32_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_s32_s32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_dp4a_s32_s32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_dp4a_s32_s32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_dp4a_s32_s32_param_2];
|
||||
; CHECK-NEXT: dp4a.s32.s32 %r4, %r1, %r2, %r3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -99,9 +99,9 @@ define i32 @test_dp2a_lo_u32_u32(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_u32_u32_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_u32_u32_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_u32_u32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_lo_u32_u32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_lo_u32_u32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_lo_u32_u32_param_2];
|
||||
; CHECK-NEXT: dp2a.lo.u32.u32 %r4, %r1, %r2, %r3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -115,9 +115,9 @@ define i32 @test_dp2a_lo_u32_s32(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_u32_s32_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_u32_s32_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_u32_s32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_lo_u32_s32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_lo_u32_s32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_lo_u32_s32_param_2];
|
||||
; CHECK-NEXT: dp2a.lo.u32.s32 %r4, %r1, %r2, %r3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -131,9 +131,9 @@ define i32 @test_dp2a_lo_s32_u32(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_s32_u32_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_s32_u32_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_s32_u32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_lo_s32_u32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_lo_s32_u32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_lo_s32_u32_param_2];
|
||||
; CHECK-NEXT: dp2a.lo.s32.u32 %r4, %r1, %r2, %r3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -147,9 +147,9 @@ define i32 @test_dp2a_lo_s32_s32(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_s32_s32_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_s32_s32_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_s32_s32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_lo_s32_s32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_lo_s32_s32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_lo_s32_s32_param_2];
|
||||
; CHECK-NEXT: dp2a.lo.s32.s32 %r4, %r1, %r2, %r3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -163,9 +163,9 @@ define i32 @test_dp2a_hi_u32_u32(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_hi_u32_u32_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_u32_u32_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_u32_u32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_hi_u32_u32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_hi_u32_u32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_hi_u32_u32_param_2];
|
||||
; CHECK-NEXT: dp2a.hi.u32.u32 %r4, %r1, %r2, %r3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -179,9 +179,9 @@ define i32 @test_dp2a_hi_u32_s32(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_hi_u32_s32_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_u32_s32_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_u32_s32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_hi_u32_s32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_hi_u32_s32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_hi_u32_s32_param_2];
|
||||
; CHECK-NEXT: dp2a.hi.u32.s32 %r4, %r1, %r2, %r3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -195,9 +195,9 @@ define i32 @test_dp2a_hi_s32_u32(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_hi_s32_u32_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_s32_u32_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_s32_u32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_hi_s32_u32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_hi_s32_u32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_hi_s32_u32_param_2];
|
||||
; CHECK-NEXT: dp2a.hi.s32.u32 %r4, %r1, %r2, %r3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -211,9 +211,9 @@ define i32 @test_dp2a_hi_s32_s32(i32 %a, i32 %b, i32 %c) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_hi_s32_s32_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_s32_s32_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_s32_s32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_hi_s32_s32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_hi_s32_s32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_hi_s32_s32_param_2];
|
||||
; CHECK-NEXT: dp2a.hi.s32.s32 %r4, %r1, %r2, %r3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
|
||||
; CHECK-NEXT: ret;
|
||||
|
||||
@@ -9,17 +9,17 @@ define void @foo(i64 %a, ptr %p0, ptr %p1) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<8>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [foo_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0];
|
||||
; CHECK-NEXT: add.s64 %rd2, %rd1, 7;
|
||||
; CHECK-NEXT: and.b64 %rd3, %rd2, -8;
|
||||
; CHECK-NEXT: alloca.u64 %rd4, %rd3, 16;
|
||||
; CHECK-NEXT: cvta.local.u64 %rd4, %rd4;
|
||||
; CHECK-NEXT: ld.param.u64 %rd5, [foo_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd5, [foo_param_1];
|
||||
; CHECK-NEXT: alloca.u64 %rd6, %rd3, 16;
|
||||
; CHECK-NEXT: cvta.local.u64 %rd6, %rd6;
|
||||
; CHECK-NEXT: ld.param.u64 %rd7, [foo_param_2];
|
||||
; CHECK-NEXT: st.u64 [%rd5], %rd4;
|
||||
; CHECK-NEXT: st.u64 [%rd7], %rd6;
|
||||
; CHECK-NEXT: ld.param.b64 %rd7, [foo_param_2];
|
||||
; CHECK-NEXT: st.b64 [%rd5], %rd4;
|
||||
; CHECK-NEXT: st.b64 [%rd7], %rd6;
|
||||
; CHECK-NEXT: ret;
|
||||
%b = alloca i8, i64 %a, align 16
|
||||
%c = alloca i8, i64 %a, align 16
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
; CHECK-LABEL: .visible .func (.param .b32 func_retval0) test_dynamic_stackalloc(
|
||||
; CHECK-NOT: __local_depot
|
||||
|
||||
; CHECK-32: ld.param.u32 %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
|
||||
; CHECK-32: ld.param.b32 %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
|
||||
; CHECK-32-NEXT: add.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 7;
|
||||
; CHECK-32-NEXT: and.b32 %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8;
|
||||
; CHECK-32-NEXT: alloca.u32 %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16;
|
||||
@@ -20,7 +20,7 @@
|
||||
; CHECK-32-NEXT: .param .b32 param0;
|
||||
; CHECK-32-NEXT: st.param.b32 [param0], %r[[ALLOCA]];
|
||||
|
||||
; CHECK-64: ld.param.u64 %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
|
||||
; CHECK-64: ld.param.b64 %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
|
||||
; CHECK-64-NEXT: add.s64 %rd[[SIZE2:[0-9]]], %rd[[SIZE]], 7;
|
||||
; CHECK-64-NEXT: and.b64 %rd[[SIZE3:[0-9]]], %rd[[SIZE2]], -8;
|
||||
; CHECK-64-NEXT: alloca.u64 %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16;
|
||||
|
||||
@@ -14,7 +14,7 @@ define {i32, i1} @elect_sync(i32 %mask) {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [elect_sync_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [elect_sync_param_0];
|
||||
; CHECK-NEXT: elect.sync %r2|%p1, %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p1;
|
||||
@@ -51,7 +51,7 @@ define {i32, i1} @elect_sync_twice(i32 %mask) {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [elect_sync_twice_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [elect_sync_twice_param_0];
|
||||
; CHECK-NEXT: elect.sync %r2|%p1, %r1;
|
||||
; CHECK-NEXT: elect.sync %r3|%p2, %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
define void @foo(ptr nocapture readonly %x_value, ptr nocapture %output) #0 {
|
||||
%1 = load <4 x float>, ptr %x_value, align 16
|
||||
%2 = fpext <4 x float> %1 to <4 x double>
|
||||
; CHECK-NOT: ld.v2.f32 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}];
|
||||
; CHECK-NOT: ld.v2.b32 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}];
|
||||
; CHECK: cvt.f64.f32
|
||||
; CHECK: cvt.f64.f32
|
||||
; CHECK: cvt.f64.f32
|
||||
|
||||
@@ -11,7 +11,7 @@ define i16 @test_v2i8(i16 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u16 %rs1, [test_v2i8_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [test_v2i8_param_0];
|
||||
; CHECK-NEXT: cvt.s16.s8 %rs2, %rs1;
|
||||
; CHECK-NEXT: shr.s16 %rs3, %rs1, 8;
|
||||
; CHECK-NEXT: add.s16 %rs4, %rs2, %rs3;
|
||||
@@ -36,8 +36,8 @@ define i1 @test_v2i8_load(ptr %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_v2i8_load_param_0];
|
||||
; CHECK-NEXT: ld.v2.u8 {%rs1, %rs2}, [%rd1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_v2i8_load_param_0];
|
||||
; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd1];
|
||||
; CHECK-NEXT: or.b16 %rs5, %rs1, %rs2;
|
||||
; CHECK-NEXT: and.b16 %rs6, %rs5, 255;
|
||||
; CHECK-NEXT: setp.eq.s16 %p1, %rs6, 0;
|
||||
@@ -59,7 +59,7 @@ define i16 @test_v4i8(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<7>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_v4i8_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_param_0];
|
||||
; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8;
|
||||
; CHECK-NEXT: cvt.s8.s32 %rs1, %r2;
|
||||
; CHECK-NEXT: bfe.s32 %r3, %r1, 8, 8;
|
||||
@@ -95,7 +95,7 @@ define i32 @test_v4i8_s32(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<9>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_v4i8_s32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_s32_param_0];
|
||||
; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8;
|
||||
; CHECK-NEXT: bfe.s32 %r3, %r1, 8, 8;
|
||||
; CHECK-NEXT: bfe.s32 %r4, %r1, 16, 8;
|
||||
@@ -126,7 +126,7 @@ define i32 @test_v4i8_u32(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<9>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_v4i8_u32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_u32_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
|
||||
; CHECK-NEXT: bfe.u32 %r3, %r1, 8, 8;
|
||||
; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
|
||||
@@ -161,7 +161,7 @@ define i16 @test_v8i8(i64 %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_v8i8_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_v8i8_param_0];
|
||||
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; }
|
||||
; CHECK-NEXT: cvt.u32.u64 %r2, %rd1;
|
||||
; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
|
||||
|
||||
@@ -220,7 +220,7 @@ define half @test_frem(half %a, half %b) #0 {
|
||||
|
||||
; CHECK-LABEL: test_store(
|
||||
; CHECK-DAG: ld.param.b16 [[A:%rs[0-9]+]], [test_store_param_0];
|
||||
; CHECK-DAG: ld.param.u64 %[[PTR:rd[0-9]+]], [test_store_param_1];
|
||||
; CHECK-DAG: ld.param.b64 %[[PTR:rd[0-9]+]], [test_store_param_1];
|
||||
; CHECK-NEXT: st.b16 [%[[PTR]]], [[A]];
|
||||
; CHECK-NEXT: ret;
|
||||
define void @test_store(half %a, ptr %b) #0 {
|
||||
@@ -229,7 +229,7 @@ define void @test_store(half %a, ptr %b) #0 {
|
||||
}
|
||||
|
||||
; CHECK-LABEL: test_load(
|
||||
; CHECK: ld.param.u64 %[[PTR:rd[0-9]+]], [test_load_param_0];
|
||||
; CHECK: ld.param.b64 %[[PTR:rd[0-9]+]], [test_load_param_0];
|
||||
; CHECK-NEXT: ld.b16 [[R:%rs[0-9]+]], [%[[PTR]]];
|
||||
; CHECK-NEXT: st.param.b16 [func_retval0], [[R]];
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -239,12 +239,12 @@ define half @test_load(ptr %a) #0 {
|
||||
}
|
||||
|
||||
; CHECK-LABEL: .visible .func test_halfp0a1(
|
||||
; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0];
|
||||
; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1];
|
||||
; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
|
||||
; CHECK-DAG: st.u8 [%[[TO]]], [[B0]]
|
||||
; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
|
||||
; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]]
|
||||
; CHECK-DAG: ld.param.b64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0];
|
||||
; CHECK-DAG: ld.param.b64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1];
|
||||
; CHECK-DAG: ld.b8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
|
||||
; CHECK-DAG: st.b8 [%[[TO]]], [[B0]]
|
||||
; CHECK-DAG: ld.b8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
|
||||
; CHECK-DAG: st.b8 [%[[TO]]+1], [[B1]]
|
||||
; CHECK: ret
|
||||
define void @test_halfp0a1(ptr noalias readonly %from, ptr %to) {
|
||||
%1 = load half, ptr %from , align 1
|
||||
@@ -357,8 +357,8 @@ define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
|
||||
}
|
||||
|
||||
; CHECK-LABEL: test_select_cc_f32_f16(
|
||||
; CHECK-DAG: ld.param.f32 [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0];
|
||||
; CHECK-DAG: ld.param.f32 [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1];
|
||||
; CHECK-DAG: ld.param.b32 [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0];
|
||||
; CHECK-DAG: ld.param.b32 [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1];
|
||||
; CHECK-DAG: ld.param.b16 [[C:%rs[0-9]+]], [test_select_cc_f32_f16_param_2];
|
||||
; CHECK-DAG: ld.param.b16 [[D:%rs[0-9]+]], [test_select_cc_f32_f16_param_3];
|
||||
; CHECK-F16-NOFTZ: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]]
|
||||
@@ -367,7 +367,7 @@ define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
|
||||
; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
|
||||
; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]]
|
||||
; CHECK-NEXT: selp.f32 [[R:%f[0-9]+]], [[A]], [[B]], [[PRED]];
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], [[R]];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], [[R]];
|
||||
; CHECK-NEXT: ret;
|
||||
define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 {
|
||||
%cc = fcmp une half %c, %d
|
||||
@@ -377,8 +377,8 @@ define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 {
|
||||
|
||||
; CHECK-LABEL: test_select_cc_f16_f32(
|
||||
; CHECK-DAG: ld.param.b16 [[A:%rs[0-9]+]], [test_select_cc_f16_f32_param_0];
|
||||
; CHECK-DAG: ld.param.f32 [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2];
|
||||
; CHECK-DAG: ld.param.f32 [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3];
|
||||
; CHECK-DAG: ld.param.b32 [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2];
|
||||
; CHECK-DAG: ld.param.b32 [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3];
|
||||
; CHECK-NOFTZ-DAG: setp.neu.f32 [[PRED:%p[0-9]+]], [[C]], [[D]]
|
||||
; CHECK-F16-FTZ-DAG: setp.neu.ftz.f32 [[PRED:%p[0-9]+]], [[C]], [[D]]
|
||||
; CHECK-DAG: ld.param.b16 [[B:%rs[0-9]+]], [test_select_cc_f16_f32_param_1];
|
||||
@@ -619,17 +619,17 @@ define i1 @test_fcmp_ord(half %a, half %b) #0 {
|
||||
; CHECK-LABEL: test_br_cc(
|
||||
; CHECK-DAG: ld.param.b16 [[A:%rs[0-9]+]], [test_br_cc_param_0];
|
||||
; CHECK-DAG: ld.param.b16 [[B:%rs[0-9]+]], [test_br_cc_param_1];
|
||||
; CHECK-DAG: ld.param.u64 %[[C:rd[0-9]+]], [test_br_cc_param_2];
|
||||
; CHECK-DAG: ld.param.u64 %[[D:rd[0-9]+]], [test_br_cc_param_3];
|
||||
; CHECK-DAG: ld.param.b64 %[[C:rd[0-9]+]], [test_br_cc_param_2];
|
||||
; CHECK-DAG: ld.param.b64 %[[D:rd[0-9]+]], [test_br_cc_param_3];
|
||||
; CHECK-F16-NOFTZ: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
|
||||
; CHECK-F16-FTZ: setp.lt.ftz.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
|
||||
; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
|
||||
; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
|
||||
; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
|
||||
; CHECK-NEXT: @[[PRED]] bra [[LABEL:\$L__BB.*]];
|
||||
; CHECK: st.u32 [%[[C]]],
|
||||
; CHECK: st.b32 [%[[C]]],
|
||||
; CHECK: [[LABEL]]:
|
||||
; CHECK: st.u32 [%[[D]]],
|
||||
; CHECK: st.b32 [%[[D]]],
|
||||
; CHECK: ret;
|
||||
define void @test_br_cc(half %a, half %b, ptr %p1, ptr %p2) #0 {
|
||||
%c = fcmp uge half %a, %b
|
||||
@@ -643,7 +643,7 @@ else:
|
||||
}
|
||||
|
||||
; CHECK-LABEL: test_phi(
|
||||
; CHECK: ld.param.u64 %[[P1:rd[0-9]+]], [test_phi_param_0];
|
||||
; CHECK: ld.param.b64 %[[P1:rd[0-9]+]], [test_phi_param_0];
|
||||
; CHECK: ld.b16 {{%rs[0-9]+}}, [%[[P1]]];
|
||||
; CHECK: [[LOOP:\$L__BB[0-9_]+]]:
|
||||
; CHECK: mov.b16 [[R:%rs[0-9]+]], [[AB:%rs[0-9]+]];
|
||||
@@ -712,7 +712,7 @@ define i64 @test_fptoui_i64(half %a) #0 {
|
||||
}
|
||||
|
||||
; CHECK-LABEL: test_uitofp_i32(
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_param_0];
|
||||
; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_uitofp_i32_param_0];
|
||||
; CHECK: cvt.rn.f16.u32 [[R:%rs[0-9]+]], [[A]];
|
||||
; CHECK: st.param.b16 [func_retval0], [[R]];
|
||||
; CHECK: ret;
|
||||
@@ -722,7 +722,7 @@ define half @test_uitofp_i32(i32 %a) #0 {
|
||||
}
|
||||
|
||||
; CHECK-LABEL: test_uitofp_i64(
|
||||
; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_uitofp_i64_param_0];
|
||||
; CHECK: ld.param.b64 [[A:%rd[0-9]+]], [test_uitofp_i64_param_0];
|
||||
; CHECK: cvt.rn.f16.u64 [[R:%rs[0-9]+]], [[A]];
|
||||
; CHECK: st.param.b16 [func_retval0], [[R]];
|
||||
; CHECK: ret;
|
||||
@@ -732,7 +732,7 @@ define half @test_uitofp_i64(i64 %a) #0 {
|
||||
}
|
||||
|
||||
; CHECK-LABEL: test_sitofp_i32(
|
||||
; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_param_0];
|
||||
; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_sitofp_i32_param_0];
|
||||
; CHECK: cvt.rn.f16.s32 [[R:%rs[0-9]+]], [[A]];
|
||||
; CHECK: st.param.b16 [func_retval0], [[R]];
|
||||
; CHECK: ret;
|
||||
@@ -742,7 +742,7 @@ define half @test_sitofp_i32(i32 %a) #0 {
|
||||
}
|
||||
|
||||
; CHECK-LABEL: test_sitofp_i64(
|
||||
; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_sitofp_i64_param_0];
|
||||
; CHECK: ld.param.b64 [[A:%rd[0-9]+]], [test_sitofp_i64_param_0];
|
||||
; CHECK: cvt.rn.f16.s64 [[R:%rs[0-9]+]], [[A]];
|
||||
; CHECK: st.param.b16 [func_retval0], [[R]];
|
||||
; CHECK: ret;
|
||||
@@ -752,7 +752,7 @@ define half @test_sitofp_i64(i64 %a) #0 {
|
||||
}
|
||||
|
||||
; CHECK-LABEL: test_uitofp_i32_fadd(
|
||||
; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0];
|
||||
; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0];
|
||||
; CHECK-DAG: cvt.rn.f16.u32 [[C:%rs[0-9]+]], [[A]];
|
||||
; CHECK-DAG: ld.param.b16 [[B:%rs[0-9]+]], [test_uitofp_i32_fadd_param_1];
|
||||
; CHECK-F16-NOFTZ: add.rn.f16 [[R:%rs[0-9]+]], [[B]], [[C]];
|
||||
@@ -770,7 +770,7 @@ define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
|
||||
}
|
||||
|
||||
; CHECK-LABEL: test_sitofp_i32_fadd(
|
||||
; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_fadd_param_0];
|
||||
; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_sitofp_i32_fadd_param_0];
|
||||
; CHECK-DAG: cvt.rn.f16.s32 [[C:%rs[0-9]+]], [[A]];
|
||||
; CHECK-DAG: ld.param.b16 [[B:%rs[0-9]+]], [test_sitofp_i32_fadd_param_1];
|
||||
; CHECK-F16-NOFTZ: add.rn.f16 [[R:%rs[0-9]+]], [[B]], [[C]];
|
||||
@@ -788,7 +788,7 @@ define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
|
||||
}
|
||||
|
||||
; CHECK-LABEL: test_fptrunc_float(
|
||||
; CHECK: ld.param.f32 [[A:%f[0-9]+]], [test_fptrunc_float_param_0];
|
||||
; CHECK: ld.param.b32 [[A:%f[0-9]+]], [test_fptrunc_float_param_0];
|
||||
; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[A]];
|
||||
; CHECK: st.param.b16 [func_retval0], [[R]];
|
||||
; CHECK: ret;
|
||||
@@ -798,7 +798,7 @@ define half @test_fptrunc_float(float %a) #0 {
|
||||
}
|
||||
|
||||
; CHECK-LABEL: test_fptrunc_double(
|
||||
; CHECK: ld.param.f64 [[A:%fd[0-9]+]], [test_fptrunc_double_param_0];
|
||||
; CHECK: ld.param.b64 [[A:%fd[0-9]+]], [test_fptrunc_double_param_0];
|
||||
; CHECK: cvt.rn.f16.f64 [[R:%rs[0-9]+]], [[A]];
|
||||
; CHECK: st.param.b16 [func_retval0], [[R]];
|
||||
; CHECK: ret;
|
||||
@@ -811,7 +811,7 @@ define half @test_fptrunc_double(double %a) #0 {
|
||||
; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fpext_float_param_0];
|
||||
; CHECK-NOFTZ: cvt.f32.f16 [[R:%f[0-9]+]], [[A]];
|
||||
; CHECK-F16-FTZ: cvt.ftz.f32.f16 [[R:%f[0-9]+]], [[A]];
|
||||
; CHECK: st.param.f32 [func_retval0], [[R]];
|
||||
; CHECK: st.param.b32 [func_retval0], [[R]];
|
||||
; CHECK: ret;
|
||||
define float @test_fpext_float(half %a) #0 {
|
||||
%r = fpext half %a to float
|
||||
@@ -821,7 +821,7 @@ define float @test_fpext_float(half %a) #0 {
|
||||
; CHECK-LABEL: test_fpext_double(
|
||||
; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fpext_double_param_0];
|
||||
; CHECK: cvt.f64.f16 [[R:%fd[0-9]+]], [[A]];
|
||||
; CHECK: st.param.f64 [func_retval0], [[R]];
|
||||
; CHECK: st.param.b64 [func_retval0], [[R]];
|
||||
; CHECK: ret;
|
||||
define double @test_fpext_double(half %a) #0 {
|
||||
%r = fpext half %a to double
|
||||
@@ -840,7 +840,7 @@ define i16 @test_bitcast_halftoi16(half %a) #0 {
|
||||
}
|
||||
|
||||
; CHECK-LABEL: test_bitcast_i16tohalf(
|
||||
; CHECK: ld.param.u16 [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0];
|
||||
; CHECK: ld.param.b16 [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0];
|
||||
; CHECK: st.param.b16 [func_retval0], [[AS]];
|
||||
; CHECK: ret;
|
||||
define half @test_bitcast_i16tohalf(i16 %a) #0 {
|
||||
@@ -1043,7 +1043,7 @@ define half @test_copysign(half %a, half %b) #0 {
|
||||
|
||||
; CHECK-LABEL: test_copysign_f32(
|
||||
; CHECK-DAG: ld.param.b16 [[AH:%rs[0-9]+]], [test_copysign_f32_param_0];
|
||||
; CHECK-DAG: ld.param.f32 [[BF:%f[0-9]+]], [test_copysign_f32_param_1];
|
||||
; CHECK-DAG: ld.param.b32 [[BF:%f[0-9]+]], [test_copysign_f32_param_1];
|
||||
; CHECK-DAG: mov.b32 [[B:%r[0-9]+]], [[BF]];
|
||||
; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AH]], 32767;
|
||||
; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648;
|
||||
@@ -1059,7 +1059,7 @@ define half @test_copysign_f32(half %a, float %b) #0 {
|
||||
|
||||
; CHECK-LABEL: test_copysign_f64(
|
||||
; CHECK-DAG: ld.param.b16 [[AH:%rs[0-9]+]], [test_copysign_f64_param_0];
|
||||
; CHECK-DAG: ld.param.f64 [[BD:%fd[0-9]+]], [test_copysign_f64_param_1];
|
||||
; CHECK-DAG: ld.param.b64 [[BD:%fd[0-9]+]], [test_copysign_f64_param_1];
|
||||
; CHECK-DAG: mov.b64 [[B:%rd[0-9]+]], [[BD]];
|
||||
; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AH]], 32767;
|
||||
; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[B]], -9223372036854775808;
|
||||
@@ -1082,7 +1082,7 @@ define half @test_copysign_f64(half %a, double %b) #0 {
|
||||
; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
|
||||
; CHECK-NOFTZ: cvt.f32.f16 [[XR:%f[0-9]+]], [[RX]];
|
||||
; CHECK-F16-FTZ: cvt.ftz.f32.f16 [[XR:%f[0-9]+]], [[RX]];
|
||||
; CHECK: st.param.f32 [func_retval0], [[XR]];
|
||||
; CHECK: st.param.b32 [func_retval0], [[XR]];
|
||||
; CHECK: ret;
|
||||
define float @test_copysign_extended(half %a, half %b) #0 {
|
||||
%r = call half @llvm.copysign.f16(half %a, half %b)
|
||||
|
||||
@@ -82,7 +82,7 @@ define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_extract_i_param_0];
|
||||
; CHECK-NEXT: setp.eq.s64 %p1, %rd1, 0;
|
||||
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
@@ -390,8 +390,8 @@ define void @test_ldst_v2f16(ptr %a, ptr %b) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2f16_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2f16_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v2f16_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v2f16_param_0];
|
||||
; CHECK-NEXT: ld.b32 %r1, [%rd1];
|
||||
; CHECK-NEXT: st.b32 [%rd2], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -412,11 +412,11 @@ define void @test_ldst_v3f16(ptr %a, ptr %b) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v3f16_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3f16_param_0];
|
||||
; CHECK-NEXT: ld.u64 %rd3, [%rd1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v3f16_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v3f16_param_0];
|
||||
; CHECK-NEXT: ld.b64 %rd3, [%rd1];
|
||||
; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd3; }
|
||||
; CHECK-NEXT: st.u32 [%rd2], %rd3;
|
||||
; CHECK-NEXT: st.b32 [%rd2], %rd3;
|
||||
; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
|
||||
; CHECK-NEXT: st.b16 [%rd2+4], %rs1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -432,8 +432,8 @@ define void @test_ldst_v4f16(ptr %a, ptr %b) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4f16_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4f16_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4f16_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4f16_param_0];
|
||||
; CHECK-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
|
||||
; CHECK-NEXT: st.v4.b16 [%rd2], {%rs1, %rs2, %rs3, %rs4};
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -449,8 +449,8 @@ define void @test_ldst_v8f16(ptr %a, ptr %b) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8f16_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8f16_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v8f16_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v8f16_param_0];
|
||||
; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -553,7 +553,7 @@ define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u8 %rs1, [test_select_param_2];
|
||||
; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2];
|
||||
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
|
||||
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_select_param_1];
|
||||
@@ -626,14 +626,14 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
|
||||
; CHECK-F16-NEXT: .reg .b32 %f<7>;
|
||||
; CHECK-F16-EMPTY:
|
||||
; CHECK-F16-NEXT: // %bb.0:
|
||||
; CHECK-F16-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f16_param_1];
|
||||
; CHECK-F16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f16_param_0];
|
||||
; CHECK-F16-NEXT: ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f32_f16_param_1];
|
||||
; CHECK-F16-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_f16_param_0];
|
||||
; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
|
||||
; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
|
||||
; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2;
|
||||
; CHECK-F16-NEXT: selp.f32 %f5, %f2, %f4, %p2;
|
||||
; CHECK-F16-NEXT: selp.f32 %f6, %f1, %f3, %p1;
|
||||
; CHECK-F16-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5};
|
||||
; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%f6, %f5};
|
||||
; CHECK-F16-NEXT: ret;
|
||||
;
|
||||
; CHECK-NOF16-LABEL: test_select_cc_f32_f16(
|
||||
@@ -644,8 +644,8 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
|
||||
; CHECK-NOF16-NEXT: .reg .b32 %f<11>;
|
||||
; CHECK-NOF16-EMPTY:
|
||||
; CHECK-NOF16-NEXT: // %bb.0:
|
||||
; CHECK-NOF16-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f16_param_1];
|
||||
; CHECK-NOF16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f16_param_0];
|
||||
; CHECK-NOF16-NEXT: ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f32_f16_param_1];
|
||||
; CHECK-NOF16-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_f16_param_0];
|
||||
; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3];
|
||||
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2];
|
||||
; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
@@ -658,7 +658,7 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
|
||||
; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %f8, %f7;
|
||||
; CHECK-NOF16-NEXT: selp.f32 %f9, %f2, %f4, %p2;
|
||||
; CHECK-NOF16-NEXT: selp.f32 %f10, %f1, %f3, %p1;
|
||||
; CHECK-NOF16-NEXT: st.param.v2.f32 [func_retval0], {%f10, %f9};
|
||||
; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%f10, %f9};
|
||||
; CHECK-NOF16-NEXT: ret;
|
||||
<2 x half> %c, <2 x half> %d) #0 {
|
||||
%cc = fcmp une <2 x half> %c, %d
|
||||
@@ -675,8 +675,8 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
|
||||
; CHECK-NEXT: .reg .b32 %f<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f16_f32_param_3];
|
||||
; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f16_f32_param_2];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f16_f32_param_3];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f16_f32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_f16_f32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_f16_f32_param_0];
|
||||
; CHECK-NEXT: setp.neu.f32 %p1, %f1, %f3;
|
||||
@@ -1388,7 +1388,7 @@ define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_param_0];
|
||||
; CHECK-NEXT: cvt.rn.f16.u32 %rs1, %r2;
|
||||
; CHECK-NEXT: cvt.rn.f16.u32 %rs2, %r1;
|
||||
; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1};
|
||||
@@ -1406,7 +1406,7 @@ define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0];
|
||||
; CHECK-NEXT: cvt.rn.f16.u64 %rs1, %rd2;
|
||||
; CHECK-NEXT: cvt.rn.f16.u64 %rs2, %rd1;
|
||||
; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1};
|
||||
@@ -1423,7 +1423,7 @@ define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_param_0];
|
||||
; CHECK-NEXT: cvt.rn.f16.s32 %rs1, %r2;
|
||||
; CHECK-NEXT: cvt.rn.f16.s32 %rs2, %r1;
|
||||
; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1};
|
||||
@@ -1441,7 +1441,7 @@ define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0];
|
||||
; CHECK-NEXT: cvt.rn.f16.s64 %rs1, %rd2;
|
||||
; CHECK-NEXT: cvt.rn.f16.s64 %rs2, %rd1;
|
||||
; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1};
|
||||
@@ -1459,7 +1459,7 @@ define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
|
||||
; CHECK-F16-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-F16-EMPTY:
|
||||
; CHECK-F16-NEXT: // %bb.0:
|
||||
; CHECK-F16-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
|
||||
; CHECK-F16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
|
||||
; CHECK-F16-NEXT: ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1];
|
||||
; CHECK-F16-NEXT: cvt.rn.f16.u32 %rs1, %r2;
|
||||
; CHECK-F16-NEXT: cvt.rn.f16.u32 %rs2, %r1;
|
||||
@@ -1475,7 +1475,7 @@ define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
|
||||
; CHECK-NOF16-NEXT: .reg .b32 %f<7>;
|
||||
; CHECK-NOF16-EMPTY:
|
||||
; CHECK-NOF16-NEXT: // %bb.0:
|
||||
; CHECK-NOF16-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
|
||||
; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0];
|
||||
; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1];
|
||||
; CHECK-NOF16-NEXT: cvt.rn.f16.u32 %rs1, %r1;
|
||||
; CHECK-NOF16-NEXT: cvt.rn.f16.u32 %rs2, %r2;
|
||||
@@ -1503,7 +1503,7 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
|
||||
; CHECK-F16-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-F16-EMPTY:
|
||||
; CHECK-F16-NEXT: // %bb.0:
|
||||
; CHECK-F16-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0];
|
||||
; CHECK-F16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0];
|
||||
; CHECK-F16-NEXT: ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1];
|
||||
; CHECK-F16-NEXT: cvt.rn.f16.s32 %rs1, %r2;
|
||||
; CHECK-F16-NEXT: cvt.rn.f16.s32 %rs2, %r1;
|
||||
@@ -1519,7 +1519,7 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
|
||||
; CHECK-NOF16-NEXT: .reg .b32 %f<7>;
|
||||
; CHECK-NOF16-EMPTY:
|
||||
; CHECK-NOF16-NEXT: // %bb.0:
|
||||
; CHECK-NOF16-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0];
|
||||
; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0];
|
||||
; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1];
|
||||
; CHECK-NOF16-NEXT: cvt.rn.f16.s32 %rs1, %r1;
|
||||
; CHECK-NOF16-NEXT: cvt.rn.f16.s32 %rs2, %r2;
|
||||
@@ -1548,7 +1548,7 @@ define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0];
|
||||
; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %f2;
|
||||
; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f1;
|
||||
; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1};
|
||||
@@ -1566,7 +1566,7 @@ define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b64 %fd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0];
|
||||
; CHECK-NEXT: cvt.rn.f16.f64 %rs1, %fd2;
|
||||
; CHECK-NEXT: cvt.rn.f16.f64 %rs2, %fd1;
|
||||
; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1};
|
||||
@@ -1588,7 +1588,7 @@ define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 {
|
||||
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; CHECK-NEXT: cvt.f32.f16 %f1, %rs2;
|
||||
; CHECK-NEXT: cvt.f32.f16 %f2, %rs1;
|
||||
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
|
||||
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%f2, %f1};
|
||||
; CHECK-NEXT: ret;
|
||||
%r = fpext <2 x half> %a to <2 x float>
|
||||
ret <2 x float> %r
|
||||
@@ -1606,7 +1606,7 @@ define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 {
|
||||
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; CHECK-NEXT: cvt.f64.f16 %fd1, %rs2;
|
||||
; CHECK-NEXT: cvt.f64.f16 %fd2, %rs1;
|
||||
; CHECK-NEXT: st.param.v2.f64 [func_retval0], {%fd2, %fd1};
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%fd2, %fd1};
|
||||
; CHECK-NEXT: ret;
|
||||
%r = fpext <2 x half> %a to <2 x double>
|
||||
ret <2 x double> %r
|
||||
@@ -1619,7 +1619,7 @@ define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_2xhalf_to_2xi16_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_2xhalf_to_2xi16_param_0];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%r = bitcast <2 x half> %a to <2 x i16>
|
||||
@@ -1632,7 +1632,7 @@ define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_2xi16_to_2xhalf_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_2xi16_to_2xhalf_param_0];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%r = bitcast <2 x i16> %a to <2 x half>
|
||||
@@ -1646,7 +1646,7 @@ define <2 x half> @test_bitcast_float_to_2xhalf(float %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [test_bitcast_float_to_2xhalf_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [test_bitcast_float_to_2xhalf_param_0];
|
||||
; CHECK-NEXT: mov.b32 %r1, %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -1661,9 +1661,9 @@ define float @test_bitcast_2xhalf_to_float(<2 x half> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_2xhalf_to_float_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_2xhalf_to_float_param_0];
|
||||
; CHECK-NEXT: mov.b32 %f1, %r1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f1;
|
||||
; CHECK-NEXT: ret;
|
||||
%r = bitcast <2 x half> %a to float
|
||||
ret float %r
|
||||
@@ -1987,7 +1987,7 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
|
||||
; CHECK-F16-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-F16-EMPTY:
|
||||
; CHECK-F16-NEXT: // %bb.0:
|
||||
; CHECK-F16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_copysign_f32_param_1];
|
||||
; CHECK-F16-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_copysign_f32_param_1];
|
||||
; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0];
|
||||
; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs1, %f2;
|
||||
; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs2, %f1;
|
||||
@@ -2005,7 +2005,7 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
|
||||
; CHECK-NOF16-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-NOF16-EMPTY:
|
||||
; CHECK-NOF16-NEXT: // %bb.0:
|
||||
; CHECK-NOF16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_copysign_f32_param_1];
|
||||
; CHECK-NOF16-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_copysign_f32_param_1];
|
||||
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0];
|
||||
; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767;
|
||||
@@ -2034,7 +2034,7 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
|
||||
; CHECK-F16-NEXT: .reg .b64 %fd<3>;
|
||||
; CHECK-F16-EMPTY:
|
||||
; CHECK-F16-NEXT: // %bb.0:
|
||||
; CHECK-F16-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_copysign_f64_param_1];
|
||||
; CHECK-F16-NEXT: ld.param.v2.b64 {%fd1, %fd2}, [test_copysign_f64_param_1];
|
||||
; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f64_param_0];
|
||||
; CHECK-F16-NEXT: cvt.rn.f16.f64 %rs1, %fd2;
|
||||
; CHECK-F16-NEXT: cvt.rn.f16.f64 %rs2, %fd1;
|
||||
@@ -2053,7 +2053,7 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
|
||||
; CHECK-NOF16-NEXT: .reg .b64 %fd<3>;
|
||||
; CHECK-NOF16-EMPTY:
|
||||
; CHECK-NOF16-NEXT: // %bb.0:
|
||||
; CHECK-NOF16-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_copysign_f64_param_1];
|
||||
; CHECK-NOF16-NEXT: ld.param.v2.b64 {%fd1, %fd2}, [test_copysign_f64_param_1];
|
||||
; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f64_param_0];
|
||||
; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767;
|
||||
@@ -2092,7 +2092,7 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
|
||||
; CHECK-F16-NEXT: mov.b32 {%rs1, %rs2}, %r5;
|
||||
; CHECK-F16-NEXT: cvt.f32.f16 %f1, %rs2;
|
||||
; CHECK-F16-NEXT: cvt.f32.f16 %f2, %rs1;
|
||||
; CHECK-F16-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
|
||||
; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%f2, %f1};
|
||||
; CHECK-F16-NEXT: ret;
|
||||
;
|
||||
; CHECK-NOF16-LABEL: test_copysign_extended(
|
||||
@@ -2114,7 +2114,7 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
|
||||
; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8;
|
||||
; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs10;
|
||||
; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs7;
|
||||
; CHECK-NOF16-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1};
|
||||
; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%f2, %f1};
|
||||
; CHECK-NOF16-NEXT: ret;
|
||||
%r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
|
||||
%xr = fpext <2 x half> %r to <2 x float>
|
||||
@@ -2359,7 +2359,7 @@ define <2 x half> @test_sitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_sitofp_2xi16_to_2xhalf_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_sitofp_2xi16_to_2xhalf_param_0];
|
||||
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; CHECK-NEXT: cvt.rn.f16.s16 %rs3, %rs2;
|
||||
; CHECK-NEXT: cvt.rn.f16.s16 %rs4, %rs1;
|
||||
@@ -2377,7 +2377,7 @@ define <2 x half> @test_uitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_uitofp_2xi16_to_2xhalf_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_uitofp_2xi16_to_2xhalf_param_0];
|
||||
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; CHECK-NEXT: cvt.rn.f16.u16 %rs3, %rs2;
|
||||
; CHECK-NEXT: cvt.rn.f16.u16 %rs4, %rs1;
|
||||
|
||||
@@ -12,9 +12,9 @@ define float @ex2_float(float %0) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [ex2_float_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [ex2_float_param_0];
|
||||
; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
%res = call float @llvm.nvvm.ex2.approx.f(float %0)
|
||||
ret float %res
|
||||
@@ -27,9 +27,9 @@ define float @ex2_float_ftz(float %0) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [ex2_float_ftz_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [ex2_float_ftz_param_0];
|
||||
; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
%res = call float @llvm.nvvm.ex2.approx.ftz.f(float %0)
|
||||
ret float %res
|
||||
|
||||
@@ -13,9 +13,9 @@ define float @lg2_float(float %0) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [lg2_float_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [lg2_float_param_0];
|
||||
; CHECK-NEXT: lg2.approx.f32 %f2, %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
%res = call float @llvm.nvvm.lg2.approx.f(float %0)
|
||||
ret float %res
|
||||
@@ -28,9 +28,9 @@ define float @lg2_float_ftz(float %0) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [lg2_float_ftz_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [lg2_float_ftz_param_0];
|
||||
; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
%res = call float @llvm.nvvm.lg2.approx.ftz.f(float %0)
|
||||
ret float %res
|
||||
|
||||
@@ -21,9 +21,9 @@ define float @fabs_float(float %a) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [fabs_float_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [fabs_float_param_0];
|
||||
; CHECK-NEXT: abs.f32 %f2, %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
%ret = call float @llvm.nvvm.fabs.f32(float %a)
|
||||
ret float %ret
|
||||
@@ -35,9 +35,9 @@ define float @fabs_float_ftz(float %a) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [fabs_float_ftz_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [fabs_float_ftz_param_0];
|
||||
; CHECK-NEXT: abs.ftz.f32 %f2, %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
%ret = call float @llvm.nvvm.fabs.ftz.f32(float %a)
|
||||
ret float %ret
|
||||
@@ -49,9 +49,9 @@ define double @fabs_double(double %a) {
|
||||
; CHECK-NEXT: .reg .b64 %fd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f64 %fd1, [fabs_double_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %fd1, [fabs_double_param_0];
|
||||
; CHECK-NEXT: abs.f64 %fd2, %fd1;
|
||||
; CHECK-NEXT: st.param.f64 [func_retval0], %fd2;
|
||||
; CHECK-NEXT: st.param.b64 [func_retval0], %fd2;
|
||||
; CHECK-NEXT: ret;
|
||||
%ret = call double @llvm.nvvm.fabs.f64(double %a)
|
||||
ret double %ret
|
||||
|
||||
@@ -16,9 +16,9 @@ define float @exp2_test(float %in) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [exp2_test_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [exp2_test_param_0];
|
||||
; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
;
|
||||
; CHECK-FP16-LABEL: exp2_test(
|
||||
@@ -26,9 +26,9 @@ define float @exp2_test(float %in) {
|
||||
; CHECK-FP16-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-FP16-EMPTY:
|
||||
; CHECK-FP16-NEXT: // %bb.0: // %entry
|
||||
; CHECK-FP16-NEXT: ld.param.f32 %f1, [exp2_test_param_0];
|
||||
; CHECK-FP16-NEXT: ld.param.b32 %f1, [exp2_test_param_0];
|
||||
; CHECK-FP16-NEXT: ex2.approx.f32 %f2, %f1;
|
||||
; CHECK-FP16-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-FP16-NEXT: ret;
|
||||
;
|
||||
; CHECK-BF16-LABEL: exp2_test(
|
||||
@@ -36,9 +36,9 @@ define float @exp2_test(float %in) {
|
||||
; CHECK-BF16-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-BF16-EMPTY:
|
||||
; CHECK-BF16-NEXT: // %bb.0: // %entry
|
||||
; CHECK-BF16-NEXT: ld.param.f32 %f1, [exp2_test_param_0];
|
||||
; CHECK-BF16-NEXT: ld.param.b32 %f1, [exp2_test_param_0];
|
||||
; CHECK-BF16-NEXT: ex2.approx.f32 %f2, %f1;
|
||||
; CHECK-BF16-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-BF16-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-BF16-NEXT: ret;
|
||||
entry:
|
||||
%exp2 = call float @llvm.exp2.f32(float %in)
|
||||
@@ -52,9 +52,9 @@ define float @exp2_ftz_test(float %in) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [exp2_ftz_test_param_0];
|
||||
; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
;
|
||||
; CHECK-FP16-LABEL: exp2_ftz_test(
|
||||
@@ -62,9 +62,9 @@ define float @exp2_ftz_test(float %in) #0 {
|
||||
; CHECK-FP16-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-FP16-EMPTY:
|
||||
; CHECK-FP16-NEXT: // %bb.0: // %entry
|
||||
; CHECK-FP16-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0];
|
||||
; CHECK-FP16-NEXT: ld.param.b32 %f1, [exp2_ftz_test_param_0];
|
||||
; CHECK-FP16-NEXT: ex2.approx.ftz.f32 %f2, %f1;
|
||||
; CHECK-FP16-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-FP16-NEXT: ret;
|
||||
;
|
||||
; CHECK-BF16-LABEL: exp2_ftz_test(
|
||||
@@ -72,9 +72,9 @@ define float @exp2_ftz_test(float %in) #0 {
|
||||
; CHECK-BF16-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-BF16-EMPTY:
|
||||
; CHECK-BF16-NEXT: // %bb.0: // %entry
|
||||
; CHECK-BF16-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0];
|
||||
; CHECK-BF16-NEXT: ld.param.b32 %f1, [exp2_ftz_test_param_0];
|
||||
; CHECK-BF16-NEXT: ex2.approx.ftz.f32 %f2, %f1;
|
||||
; CHECK-BF16-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-BF16-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-BF16-NEXT: ret;
|
||||
entry:
|
||||
%exp2 = call float @llvm.exp2.f32(float %in)
|
||||
@@ -88,10 +88,10 @@ define <2 x float> @exp2_test_v(<2 x float> %in) {
|
||||
; CHECK-NEXT: .reg .b32 %f<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%f1, %f2}, [exp2_test_v_param_0];
|
||||
; CHECK-NEXT: ex2.approx.f32 %f3, %f2;
|
||||
; CHECK-NEXT: ex2.approx.f32 %f4, %f1;
|
||||
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
|
||||
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%f4, %f3};
|
||||
; CHECK-NEXT: ret;
|
||||
;
|
||||
; CHECK-FP16-LABEL: exp2_test_v(
|
||||
@@ -99,10 +99,10 @@ define <2 x float> @exp2_test_v(<2 x float> %in) {
|
||||
; CHECK-FP16-NEXT: .reg .b32 %f<5>;
|
||||
; CHECK-FP16-EMPTY:
|
||||
; CHECK-FP16-NEXT: // %bb.0: // %entry
|
||||
; CHECK-FP16-NEXT: ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0];
|
||||
; CHECK-FP16-NEXT: ld.param.v2.b32 {%f1, %f2}, [exp2_test_v_param_0];
|
||||
; CHECK-FP16-NEXT: ex2.approx.f32 %f3, %f2;
|
||||
; CHECK-FP16-NEXT: ex2.approx.f32 %f4, %f1;
|
||||
; CHECK-FP16-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
|
||||
; CHECK-FP16-NEXT: st.param.v2.b32 [func_retval0], {%f4, %f3};
|
||||
; CHECK-FP16-NEXT: ret;
|
||||
;
|
||||
; CHECK-BF16-LABEL: exp2_test_v(
|
||||
@@ -110,10 +110,10 @@ define <2 x float> @exp2_test_v(<2 x float> %in) {
|
||||
; CHECK-BF16-NEXT: .reg .b32 %f<5>;
|
||||
; CHECK-BF16-EMPTY:
|
||||
; CHECK-BF16-NEXT: // %bb.0: // %entry
|
||||
; CHECK-BF16-NEXT: ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0];
|
||||
; CHECK-BF16-NEXT: ld.param.v2.b32 {%f1, %f2}, [exp2_test_v_param_0];
|
||||
; CHECK-BF16-NEXT: ex2.approx.f32 %f3, %f2;
|
||||
; CHECK-BF16-NEXT: ex2.approx.f32 %f4, %f1;
|
||||
; CHECK-BF16-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
|
||||
; CHECK-BF16-NEXT: st.param.v2.b32 [func_retval0], {%f4, %f3};
|
||||
; CHECK-BF16-NEXT: ret;
|
||||
entry:
|
||||
%exp2 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in)
|
||||
@@ -259,7 +259,7 @@ define bfloat @exp2_bf16_test(bfloat %in) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.u16 %r1, [exp2_bf16_test_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %r1, [exp2_bf16_test_param_0];
|
||||
; CHECK-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-NEXT: ex2.approx.f32 %f2, %f1;
|
||||
@@ -282,7 +282,7 @@ define bfloat @exp2_bf16_test(bfloat %in) {
|
||||
; CHECK-FP16-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-FP16-EMPTY:
|
||||
; CHECK-FP16-NEXT: // %bb.0: // %entry
|
||||
; CHECK-FP16-NEXT: ld.param.u16 %r1, [exp2_bf16_test_param_0];
|
||||
; CHECK-FP16-NEXT: ld.param.b16 %r1, [exp2_bf16_test_param_0];
|
||||
; CHECK-FP16-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-FP16-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-FP16-NEXT: ex2.approx.f32 %f2, %f1;
|
||||
|
||||
@@ -10,7 +10,7 @@ define i32 @flo_1(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [flo_1_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [flo_1_param_0];
|
||||
; CHECK-NEXT: bfind.s32 %r2, %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -25,7 +25,7 @@ define i32 @flo_2(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [flo_2_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [flo_2_param_0];
|
||||
; CHECK-NEXT: bfind.shiftamt.s32 %r2, %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -39,7 +39,7 @@ define i32 @flo_3(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [flo_3_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [flo_3_param_0];
|
||||
; CHECK-NEXT: bfind.u32 %r2, %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -54,7 +54,7 @@ define i32 @flo_4(i32 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [flo_4_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [flo_4_param_0];
|
||||
; CHECK-NEXT: bfind.shiftamt.u32 %r2, %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -71,7 +71,7 @@ define i32 @flo_5(i64 %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [flo_5_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [flo_5_param_0];
|
||||
; CHECK-NEXT: bfind.s64 %r1, %rd1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -87,7 +87,7 @@ define i32 @flo_6(i64 %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [flo_6_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [flo_6_param_0];
|
||||
; CHECK-NEXT: bfind.shiftamt.s64 %r1, %rd1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -102,7 +102,7 @@ define i32 @flo_7(i64 %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [flo_7_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [flo_7_param_0];
|
||||
; CHECK-NEXT: bfind.u64 %r1, %rd1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -118,7 +118,7 @@ define i32 @flo_8(i64 %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [flo_8_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [flo_8_param_0];
|
||||
; CHECK-NEXT: bfind.shiftamt.u64 %r1, %rd1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
|
||||
@@ -10,9 +10,9 @@ define float @log2_test(float %in) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [log2_test_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [log2_test_param_0];
|
||||
; CHECK-NEXT: lg2.approx.f32 %f2, %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
entry:
|
||||
%log2 = call float @llvm.log2.f32(float %in)
|
||||
@@ -26,9 +26,9 @@ define float @log2_ftz_test(float %in) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [log2_ftz_test_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [log2_ftz_test_param_0];
|
||||
; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f2;
|
||||
; CHECK-NEXT: ret;
|
||||
entry:
|
||||
%log2 = call float @llvm.log2.f32(float %in)
|
||||
@@ -42,10 +42,10 @@ define <2 x float> @log2_test_v(<2 x float> %in) {
|
||||
; CHECK-NEXT: .reg .b32 %f<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [log2_test_v_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%f1, %f2}, [log2_test_v_param_0];
|
||||
; CHECK-NEXT: lg2.approx.f32 %f3, %f2;
|
||||
; CHECK-NEXT: lg2.approx.f32 %f4, %f1;
|
||||
; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3};
|
||||
; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%f4, %f3};
|
||||
; CHECK-NEXT: ret;
|
||||
entry:
|
||||
%log2 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in)
|
||||
@@ -129,7 +129,7 @@ define bfloat @log2_bf16_test(bfloat %in) {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.u16 %r1, [log2_bf16_test_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %r1, [log2_bf16_test_param_0];
|
||||
; CHECK-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-NEXT: lg2.approx.f32 %f2, %f1;
|
||||
@@ -158,7 +158,7 @@ define bfloat @log2_bf16_ftz_test(bfloat %in) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %f<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.u16 %r1, [log2_bf16_ftz_test_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %r1, [log2_bf16_ftz_test_param_0];
|
||||
; CHECK-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1;
|
||||
|
||||
@@ -253,13 +253,13 @@ define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat %
|
||||
; CHECK-SM70-NEXT: .reg .b32 %f<6>;
|
||||
; CHECK-SM70-EMPTY:
|
||||
; CHECK-SM70-NEXT: // %bb.0:
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_unsafe_with_nans_param_2];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_unsafe_with_nans_param_2];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_unsafe_with_nans_param_1];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_unsafe_with_nans_param_1];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_unsafe_with_nans_param_0];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_unsafe_with_nans_param_0];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
|
||||
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
|
||||
@@ -317,13 +317,13 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
|
||||
; CHECK-SM70-NEXT: .reg .b32 %f<6>;
|
||||
; CHECK-SM70-EMPTY:
|
||||
; CHECK-SM70-NEXT: // %bb.0:
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
|
||||
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
|
||||
@@ -405,13 +405,13 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
|
||||
; CHECK-SM70-NEXT: .reg .b32 %f<10>;
|
||||
; CHECK-SM70-EMPTY:
|
||||
; CHECK-SM70-NEXT: // %bb.0:
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
|
||||
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
|
||||
@@ -493,13 +493,13 @@ define bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c)
|
||||
; CHECK-SM70-NEXT: .reg .b32 %f<7>;
|
||||
; CHECK-SM70-EMPTY:
|
||||
; CHECK-SM70-NEXT: // %bb.0:
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
|
||||
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
|
||||
|
||||
@@ -187,13 +187,13 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
|
||||
; CHECK-SM70-NEXT: .reg .b32 %f<6>;
|
||||
; CHECK-SM70-EMPTY:
|
||||
; CHECK-SM70-NEXT: // %bb.0:
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
|
||||
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
|
||||
@@ -267,13 +267,13 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
|
||||
; CHECK-SM70-NEXT: .reg .b32 %f<9>;
|
||||
; CHECK-SM70-EMPTY:
|
||||
; CHECK-SM70-NEXT: // %bb.0:
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
|
||||
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
|
||||
@@ -348,13 +348,13 @@ define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 {
|
||||
; CHECK-SM70-NEXT: .reg .b32 %f<7>;
|
||||
; CHECK-SM70-EMPTY:
|
||||
; CHECK-SM70-NEXT: // %bb.0:
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_maxnum_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_maxnum_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_maxnum_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_maxnum_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_maxnum_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_maxnum_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
|
||||
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
|
||||
|
||||
@@ -198,13 +198,13 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) {
|
||||
; CHECK-SM70-NEXT: .reg .b32 %f<6>;
|
||||
; CHECK-SM70-EMPTY:
|
||||
; CHECK-SM70-NEXT: // %bb.0:
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
|
||||
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
|
||||
@@ -286,13 +286,13 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat
|
||||
; CHECK-SM70-NEXT: .reg .b32 %f<10>;
|
||||
; CHECK-SM70-EMPTY:
|
||||
; CHECK-SM70-NEXT: // %bb.0:
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
|
||||
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
|
||||
@@ -376,13 +376,13 @@ define bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c)
|
||||
; CHECK-SM70-NEXT: .reg .b32 %f<7>;
|
||||
; CHECK-SM70-EMPTY:
|
||||
; CHECK-SM70-NEXT: // %bb.0:
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
|
||||
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
|
||||
@@ -1134,13 +1134,13 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) {
|
||||
; CHECK-SM70-NEXT: .reg .b32 %f<6>;
|
||||
; CHECK-SM70-EMPTY:
|
||||
; CHECK-SM70-NEXT: // %bb.0:
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
|
||||
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
|
||||
@@ -1214,13 +1214,13 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa
|
||||
; CHECK-SM70-NEXT: .reg .b32 %f<9>;
|
||||
; CHECK-SM70-EMPTY:
|
||||
; CHECK-SM70-NEXT: // %bb.0:
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
|
||||
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
|
||||
@@ -1295,13 +1295,13 @@ define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) {
|
||||
; CHECK-SM70-NEXT: .reg .b32 %f<7>;
|
||||
; CHECK-SM70-EMPTY:
|
||||
; CHECK-SM70-NEXT: // %bb.0:
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_maxnum_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_maxnum_no_nans_param_2];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f1, %r2;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_maxnum_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_maxnum_no_nans_param_1];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f2, %r4;
|
||||
; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_maxnum_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_maxnum_no_nans_param_0];
|
||||
; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16;
|
||||
; CHECK-SM70-NEXT: mov.b32 %f3, %r6;
|
||||
; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1;
|
||||
|
||||
@@ -5,9 +5,9 @@ declare i32 @llvm.nvvm.fns(i32, i32, i32)
|
||||
|
||||
; CHECK-LABEL: .func{{.*}}fns
|
||||
define i32 @fns(i32 %mask, i32 %base, i32 %offset) {
|
||||
; CHECK: ld.param.u32 [[MASK:%r[0-9]+]], [fns_param_0];
|
||||
; CHECK: ld.param.u32 [[BASE:%r[0-9]+]], [fns_param_1];
|
||||
; CHECK: ld.param.u32 [[OFFSET:%r[0-9]+]], [fns_param_2];
|
||||
; CHECK: ld.param.b32 [[MASK:%r[0-9]+]], [fns_param_0];
|
||||
; CHECK: ld.param.b32 [[BASE:%r[0-9]+]], [fns_param_1];
|
||||
; CHECK: ld.param.b32 [[OFFSET:%r[0-9]+]], [fns_param_2];
|
||||
|
||||
; CHECK: fns.b32 {{%r[0-9]+}}, [[MASK]], [[BASE]], [[OFFSET]];
|
||||
%r0 = call i32 @llvm.nvvm.fns(i32 %mask, i32 %base, i32 %offset);
|
||||
|
||||
@@ -10,7 +10,7 @@ define i32 @test_ld_param_const(ptr byval(i32) %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_ld_param_const_param_0+4];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_ld_param_const_param_0+4];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%p2 = getelementptr i32, ptr %a, i32 1
|
||||
@@ -28,7 +28,7 @@ define i32 @test_ld_param_non_const(ptr byval([10 x i32]) %a, i32 %b) {
|
||||
; CHECK-NEXT: mov.b64 %rd1, test_ld_param_non_const_param_0;
|
||||
; CHECK-NEXT: ld.param.s32 %rd2, [test_ld_param_non_const_param_1];
|
||||
; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2;
|
||||
; CHECK-NEXT: ld.local.u32 %r1, [%rd3];
|
||||
; CHECK-NEXT: ld.local.b32 %r1, [%rd3];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%p2 = getelementptr i8, ptr %a, i32 %b
|
||||
@@ -68,7 +68,7 @@ define void @test_ld_param_byval(ptr byval(i32) %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_ld_param_byval_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_ld_param_byval_param_0];
|
||||
; CHECK-NEXT: { // callseq 1, 0
|
||||
; CHECK-NEXT: .param .align 4 .b8 param0[4];
|
||||
; CHECK-NEXT: st.param.b32 [param0], %r1;
|
||||
@@ -91,9 +91,9 @@ define i32 @test_modify_param(ptr byval([10 x i32]) %a, i32 %b, i32 %c ) {
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: mov.b64 %rd1, test_modify_param_param_0;
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_modify_param_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_modify_param_param_2];
|
||||
; CHECK-NEXT: st.local.u32 [%rd1+2], %r1;
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_modify_param_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_modify_param_param_2];
|
||||
; CHECK-NEXT: st.local.b32 [%rd1+2], %r1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
%p2 = getelementptr i8, ptr %a, i32 2
|
||||
@@ -110,16 +110,16 @@ define i32 @test_multi_block(ptr byval([10 x i32]) %a, i1 %p) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u8 %rs1, [test_multi_block_param_1];
|
||||
; CHECK-NEXT: ld.param.b8 %rs1, [test_multi_block_param_1];
|
||||
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
|
||||
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
|
||||
; CHECK-NEXT: not.pred %p2, %p1;
|
||||
; CHECK-NEXT: @%p2 bra $L__BB5_2;
|
||||
; CHECK-NEXT: // %bb.1: // %if
|
||||
; CHECK-NEXT: ld.param.u32 %r4, [test_multi_block_param_0+4];
|
||||
; CHECK-NEXT: ld.param.b32 %r4, [test_multi_block_param_0+4];
|
||||
; CHECK-NEXT: bra.uni $L__BB5_3;
|
||||
; CHECK-NEXT: $L__BB5_2: // %else
|
||||
; CHECK-NEXT: ld.param.u32 %r4, [test_multi_block_param_0+8];
|
||||
; CHECK-NEXT: ld.param.b32 %r4, [test_multi_block_param_0+8];
|
||||
; CHECK-NEXT: $L__BB5_3: // %end
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
|
||||
; CHECK-NEXT: ret;
|
||||
|
||||
@@ -18,11 +18,11 @@ define float @t0(float %a, float %b, float %c) {
|
||||
; FAST-NEXT: .reg .b32 %f<5>;
|
||||
; FAST-EMPTY:
|
||||
; FAST-NEXT: // %bb.0:
|
||||
; FAST-NEXT: ld.param.f32 %f1, [t0_param_0];
|
||||
; FAST-NEXT: ld.param.f32 %f2, [t0_param_1];
|
||||
; FAST-NEXT: ld.param.f32 %f3, [t0_param_2];
|
||||
; FAST-NEXT: ld.param.b32 %f1, [t0_param_0];
|
||||
; FAST-NEXT: ld.param.b32 %f2, [t0_param_1];
|
||||
; FAST-NEXT: ld.param.b32 %f3, [t0_param_2];
|
||||
; FAST-NEXT: fma.rn.f32 %f4, %f1, %f2, %f3;
|
||||
; FAST-NEXT: st.param.f32 [func_retval0], %f4;
|
||||
; FAST-NEXT: st.param.b32 [func_retval0], %f4;
|
||||
; FAST-NEXT: ret;
|
||||
;
|
||||
; DEFAULT-LABEL: t0(
|
||||
@@ -30,12 +30,12 @@ define float @t0(float %a, float %b, float %c) {
|
||||
; DEFAULT-NEXT: .reg .b32 %f<6>;
|
||||
; DEFAULT-EMPTY:
|
||||
; DEFAULT-NEXT: // %bb.0:
|
||||
; DEFAULT-NEXT: ld.param.f32 %f1, [t0_param_0];
|
||||
; DEFAULT-NEXT: ld.param.f32 %f2, [t0_param_1];
|
||||
; DEFAULT-NEXT: ld.param.b32 %f1, [t0_param_0];
|
||||
; DEFAULT-NEXT: ld.param.b32 %f2, [t0_param_1];
|
||||
; DEFAULT-NEXT: mul.rn.f32 %f3, %f1, %f2;
|
||||
; DEFAULT-NEXT: ld.param.f32 %f4, [t0_param_2];
|
||||
; DEFAULT-NEXT: ld.param.b32 %f4, [t0_param_2];
|
||||
; DEFAULT-NEXT: add.rn.f32 %f5, %f3, %f4;
|
||||
; DEFAULT-NEXT: st.param.f32 [func_retval0], %f5;
|
||||
; DEFAULT-NEXT: st.param.b32 [func_retval0], %f5;
|
||||
; DEFAULT-NEXT: ret;
|
||||
%v0 = fmul float %a, %b
|
||||
%v1 = fadd float %v0, %c
|
||||
@@ -50,12 +50,12 @@ define float @t1(float %a, float %b) {
|
||||
; FAST-NEXT: .reg .b32 %f<6>;
|
||||
; FAST-EMPTY:
|
||||
; FAST-NEXT: // %bb.0:
|
||||
; FAST-NEXT: ld.param.f32 %f1, [t1_param_0];
|
||||
; FAST-NEXT: ld.param.f32 %f2, [t1_param_1];
|
||||
; FAST-NEXT: ld.param.b32 %f1, [t1_param_0];
|
||||
; FAST-NEXT: ld.param.b32 %f2, [t1_param_1];
|
||||
; FAST-NEXT: add.f32 %f3, %f1, %f2;
|
||||
; FAST-NEXT: sub.f32 %f4, %f1, %f2;
|
||||
; FAST-NEXT: mul.f32 %f5, %f3, %f4;
|
||||
; FAST-NEXT: st.param.f32 [func_retval0], %f5;
|
||||
; FAST-NEXT: st.param.b32 [func_retval0], %f5;
|
||||
; FAST-NEXT: ret;
|
||||
;
|
||||
; DEFAULT-LABEL: t1(
|
||||
@@ -63,12 +63,12 @@ define float @t1(float %a, float %b) {
|
||||
; DEFAULT-NEXT: .reg .b32 %f<6>;
|
||||
; DEFAULT-EMPTY:
|
||||
; DEFAULT-NEXT: // %bb.0:
|
||||
; DEFAULT-NEXT: ld.param.f32 %f1, [t1_param_0];
|
||||
; DEFAULT-NEXT: ld.param.f32 %f2, [t1_param_1];
|
||||
; DEFAULT-NEXT: ld.param.b32 %f1, [t1_param_0];
|
||||
; DEFAULT-NEXT: ld.param.b32 %f2, [t1_param_1];
|
||||
; DEFAULT-NEXT: add.rn.f32 %f3, %f1, %f2;
|
||||
; DEFAULT-NEXT: sub.rn.f32 %f4, %f1, %f2;
|
||||
; DEFAULT-NEXT: mul.rn.f32 %f5, %f3, %f4;
|
||||
; DEFAULT-NEXT: st.param.f32 [func_retval0], %f5;
|
||||
; DEFAULT-NEXT: st.param.b32 [func_retval0], %f5;
|
||||
; DEFAULT-NEXT: ret;
|
||||
%v1 = fadd float %a, %b
|
||||
%v2 = fsub float %a, %b
|
||||
@@ -84,12 +84,12 @@ define float @t2(float %a, float %b) {
|
||||
; CHECK-NEXT: .reg .b32 %f<6>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [t2_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [t2_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [t2_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [t2_param_1];
|
||||
; CHECK-NEXT: add.f32 %f3, %f1, %f2;
|
||||
; CHECK-NEXT: sub.f32 %f4, %f1, %f2;
|
||||
; CHECK-NEXT: mul.f32 %f5, %f3, %f4;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f5;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f5;
|
||||
; CHECK-NEXT: ret;
|
||||
%v1 = fadd contract float %a, %b
|
||||
%v2 = fsub contract float %a, %b
|
||||
@@ -104,11 +104,11 @@ define float @t3(float %a, float %b, float %c) {
|
||||
; CHECK-NEXT: .reg .b32 %f<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [t3_param_0];
|
||||
; CHECK-NEXT: ld.param.f32 %f2, [t3_param_1];
|
||||
; CHECK-NEXT: ld.param.f32 %f3, [t3_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [t3_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f2, [t3_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %f3, [t3_param_2];
|
||||
; CHECK-NEXT: fma.rn.f32 %f4, %f1, %f2, %f3;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f4;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f4;
|
||||
; CHECK-NEXT: ret;
|
||||
%v0 = fmul contract float %a, %b
|
||||
%v1 = fadd contract float %v0, %c
|
||||
|
||||
@@ -10,7 +10,7 @@ define fp128 @identity(fp128 %x) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [identity_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [identity_param_0];
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
|
||||
; CHECK-NEXT: ret;
|
||||
ret fp128 %x
|
||||
@@ -22,10 +22,10 @@ define void @load_store(ptr %in, ptr %out) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [load_store_param_0];
|
||||
; CHECK-NEXT: ld.v2.u64 {%rd2, %rd3}, [%rd1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd4, [load_store_param_1];
|
||||
; CHECK-NEXT: st.v2.u64 [%rd4], {%rd2, %rd3};
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [load_store_param_0];
|
||||
; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd4, [load_store_param_1];
|
||||
; CHECK-NEXT: st.v2.b64 [%rd4], {%rd2, %rd3};
|
||||
; CHECK-NEXT: ret;
|
||||
%val = load fp128, ptr %in
|
||||
store fp128 %val, ptr %out
|
||||
@@ -38,7 +38,7 @@ define void @call(fp128 %x) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [call_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [call_param_0];
|
||||
; CHECK-NEXT: { // callseq 0, 0
|
||||
; CHECK-NEXT: .param .align 16 .b8 param0[16];
|
||||
; CHECK-NEXT: st.param.v2.b64 [param0], {%rd1, %rd2};
|
||||
|
||||
@@ -54,13 +54,13 @@ define float @frem_f32(float %a, float %b) {
|
||||
; FAST-NEXT: .reg .b32 %f<7>;
|
||||
; FAST-EMPTY:
|
||||
; FAST-NEXT: // %bb.0:
|
||||
; FAST-NEXT: ld.param.f32 %f1, [frem_f32_param_0];
|
||||
; FAST-NEXT: ld.param.f32 %f2, [frem_f32_param_1];
|
||||
; FAST-NEXT: ld.param.b32 %f1, [frem_f32_param_0];
|
||||
; FAST-NEXT: ld.param.b32 %f2, [frem_f32_param_1];
|
||||
; FAST-NEXT: div.approx.f32 %f3, %f1, %f2;
|
||||
; FAST-NEXT: cvt.rzi.f32.f32 %f4, %f3;
|
||||
; FAST-NEXT: neg.f32 %f5, %f4;
|
||||
; FAST-NEXT: fma.rn.f32 %f6, %f5, %f2, %f1;
|
||||
; FAST-NEXT: st.param.f32 [func_retval0], %f6;
|
||||
; FAST-NEXT: st.param.b32 [func_retval0], %f6;
|
||||
; FAST-NEXT: ret;
|
||||
;
|
||||
; NORMAL-LABEL: frem_f32(
|
||||
@@ -69,15 +69,15 @@ define float @frem_f32(float %a, float %b) {
|
||||
; NORMAL-NEXT: .reg .b32 %f<8>;
|
||||
; NORMAL-EMPTY:
|
||||
; NORMAL-NEXT: // %bb.0:
|
||||
; NORMAL-NEXT: ld.param.f32 %f1, [frem_f32_param_0];
|
||||
; NORMAL-NEXT: ld.param.f32 %f2, [frem_f32_param_1];
|
||||
; NORMAL-NEXT: ld.param.b32 %f1, [frem_f32_param_0];
|
||||
; NORMAL-NEXT: ld.param.b32 %f2, [frem_f32_param_1];
|
||||
; NORMAL-NEXT: div.rn.f32 %f3, %f1, %f2;
|
||||
; NORMAL-NEXT: cvt.rzi.f32.f32 %f4, %f3;
|
||||
; NORMAL-NEXT: neg.f32 %f5, %f4;
|
||||
; NORMAL-NEXT: fma.rn.f32 %f6, %f5, %f2, %f1;
|
||||
; NORMAL-NEXT: testp.infinite.f32 %p1, %f2;
|
||||
; NORMAL-NEXT: selp.f32 %f7, %f1, %f6, %p1;
|
||||
; NORMAL-NEXT: st.param.f32 [func_retval0], %f7;
|
||||
; NORMAL-NEXT: st.param.b32 [func_retval0], %f7;
|
||||
; NORMAL-NEXT: ret;
|
||||
%r = frem float %a, %b
|
||||
ret float %r
|
||||
@@ -89,13 +89,13 @@ define double @frem_f64(double %a, double %b) {
|
||||
; FAST-NEXT: .reg .b64 %fd<7>;
|
||||
; FAST-EMPTY:
|
||||
; FAST-NEXT: // %bb.0:
|
||||
; FAST-NEXT: ld.param.f64 %fd1, [frem_f64_param_0];
|
||||
; FAST-NEXT: ld.param.f64 %fd2, [frem_f64_param_1];
|
||||
; FAST-NEXT: ld.param.b64 %fd1, [frem_f64_param_0];
|
||||
; FAST-NEXT: ld.param.b64 %fd2, [frem_f64_param_1];
|
||||
; FAST-NEXT: div.rn.f64 %fd3, %fd1, %fd2;
|
||||
; FAST-NEXT: cvt.rzi.f64.f64 %fd4, %fd3;
|
||||
; FAST-NEXT: neg.f64 %fd5, %fd4;
|
||||
; FAST-NEXT: fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
|
||||
; FAST-NEXT: st.param.f64 [func_retval0], %fd6;
|
||||
; FAST-NEXT: st.param.b64 [func_retval0], %fd6;
|
||||
; FAST-NEXT: ret;
|
||||
;
|
||||
; NORMAL-LABEL: frem_f64(
|
||||
@@ -104,15 +104,15 @@ define double @frem_f64(double %a, double %b) {
|
||||
; NORMAL-NEXT: .reg .b64 %fd<8>;
|
||||
; NORMAL-EMPTY:
|
||||
; NORMAL-NEXT: // %bb.0:
|
||||
; NORMAL-NEXT: ld.param.f64 %fd1, [frem_f64_param_0];
|
||||
; NORMAL-NEXT: ld.param.f64 %fd2, [frem_f64_param_1];
|
||||
; NORMAL-NEXT: ld.param.b64 %fd1, [frem_f64_param_0];
|
||||
; NORMAL-NEXT: ld.param.b64 %fd2, [frem_f64_param_1];
|
||||
; NORMAL-NEXT: div.rn.f64 %fd3, %fd1, %fd2;
|
||||
; NORMAL-NEXT: cvt.rzi.f64.f64 %fd4, %fd3;
|
||||
; NORMAL-NEXT: neg.f64 %fd5, %fd4;
|
||||
; NORMAL-NEXT: fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
|
||||
; NORMAL-NEXT: testp.infinite.f64 %p1, %fd2;
|
||||
; NORMAL-NEXT: selp.f64 %fd7, %fd1, %fd6, %p1;
|
||||
; NORMAL-NEXT: st.param.f64 [func_retval0], %fd7;
|
||||
; NORMAL-NEXT: st.param.b64 [func_retval0], %fd7;
|
||||
; NORMAL-NEXT: ret;
|
||||
%r = frem double %a, %b
|
||||
ret double %r
|
||||
@@ -164,13 +164,13 @@ define float @frem_f32_ninf(float %a, float %b) {
|
||||
; FAST-NEXT: .reg .b32 %f<7>;
|
||||
; FAST-EMPTY:
|
||||
; FAST-NEXT: // %bb.0:
|
||||
; FAST-NEXT: ld.param.f32 %f1, [frem_f32_ninf_param_0];
|
||||
; FAST-NEXT: ld.param.f32 %f2, [frem_f32_ninf_param_1];
|
||||
; FAST-NEXT: ld.param.b32 %f1, [frem_f32_ninf_param_0];
|
||||
; FAST-NEXT: ld.param.b32 %f2, [frem_f32_ninf_param_1];
|
||||
; FAST-NEXT: div.approx.f32 %f3, %f1, %f2;
|
||||
; FAST-NEXT: cvt.rzi.f32.f32 %f4, %f3;
|
||||
; FAST-NEXT: neg.f32 %f5, %f4;
|
||||
; FAST-NEXT: fma.rn.f32 %f6, %f5, %f2, %f1;
|
||||
; FAST-NEXT: st.param.f32 [func_retval0], %f6;
|
||||
; FAST-NEXT: st.param.b32 [func_retval0], %f6;
|
||||
; FAST-NEXT: ret;
|
||||
;
|
||||
; NORMAL-LABEL: frem_f32_ninf(
|
||||
@@ -178,13 +178,13 @@ define float @frem_f32_ninf(float %a, float %b) {
|
||||
; NORMAL-NEXT: .reg .b32 %f<7>;
|
||||
; NORMAL-EMPTY:
|
||||
; NORMAL-NEXT: // %bb.0:
|
||||
; NORMAL-NEXT: ld.param.f32 %f1, [frem_f32_ninf_param_0];
|
||||
; NORMAL-NEXT: ld.param.f32 %f2, [frem_f32_ninf_param_1];
|
||||
; NORMAL-NEXT: ld.param.b32 %f1, [frem_f32_ninf_param_0];
|
||||
; NORMAL-NEXT: ld.param.b32 %f2, [frem_f32_ninf_param_1];
|
||||
; NORMAL-NEXT: div.rn.f32 %f3, %f1, %f2;
|
||||
; NORMAL-NEXT: cvt.rzi.f32.f32 %f4, %f3;
|
||||
; NORMAL-NEXT: neg.f32 %f5, %f4;
|
||||
; NORMAL-NEXT: fma.rn.f32 %f6, %f5, %f2, %f1;
|
||||
; NORMAL-NEXT: st.param.f32 [func_retval0], %f6;
|
||||
; NORMAL-NEXT: st.param.b32 [func_retval0], %f6;
|
||||
; NORMAL-NEXT: ret;
|
||||
%r = frem ninf float %a, %b
|
||||
ret float %r
|
||||
@@ -196,13 +196,13 @@ define double @frem_f64_ninf(double %a, double %b) {
|
||||
; FAST-NEXT: .reg .b64 %fd<7>;
|
||||
; FAST-EMPTY:
|
||||
; FAST-NEXT: // %bb.0:
|
||||
; FAST-NEXT: ld.param.f64 %fd1, [frem_f64_ninf_param_0];
|
||||
; FAST-NEXT: ld.param.f64 %fd2, [frem_f64_ninf_param_1];
|
||||
; FAST-NEXT: ld.param.b64 %fd1, [frem_f64_ninf_param_0];
|
||||
; FAST-NEXT: ld.param.b64 %fd2, [frem_f64_ninf_param_1];
|
||||
; FAST-NEXT: div.rn.f64 %fd3, %fd1, %fd2;
|
||||
; FAST-NEXT: cvt.rzi.f64.f64 %fd4, %fd3;
|
||||
; FAST-NEXT: neg.f64 %fd5, %fd4;
|
||||
; FAST-NEXT: fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
|
||||
; FAST-NEXT: st.param.f64 [func_retval0], %fd6;
|
||||
; FAST-NEXT: st.param.b64 [func_retval0], %fd6;
|
||||
; FAST-NEXT: ret;
|
||||
;
|
||||
; NORMAL-LABEL: frem_f64_ninf(
|
||||
@@ -210,13 +210,13 @@ define double @frem_f64_ninf(double %a, double %b) {
|
||||
; NORMAL-NEXT: .reg .b64 %fd<7>;
|
||||
; NORMAL-EMPTY:
|
||||
; NORMAL-NEXT: // %bb.0:
|
||||
; NORMAL-NEXT: ld.param.f64 %fd1, [frem_f64_ninf_param_0];
|
||||
; NORMAL-NEXT: ld.param.f64 %fd2, [frem_f64_ninf_param_1];
|
||||
; NORMAL-NEXT: ld.param.b64 %fd1, [frem_f64_ninf_param_0];
|
||||
; NORMAL-NEXT: ld.param.b64 %fd2, [frem_f64_ninf_param_1];
|
||||
; NORMAL-NEXT: div.rn.f64 %fd3, %fd1, %fd2;
|
||||
; NORMAL-NEXT: cvt.rzi.f64.f64 %fd4, %fd3;
|
||||
; NORMAL-NEXT: neg.f64 %fd5, %fd4;
|
||||
; NORMAL-NEXT: fma.rn.f64 %fd6, %fd5, %fd2, %fd1;
|
||||
; NORMAL-NEXT: st.param.f64 [func_retval0], %fd6;
|
||||
; NORMAL-NEXT: st.param.b64 [func_retval0], %fd6;
|
||||
; NORMAL-NEXT: ret;
|
||||
%r = frem ninf double %a, %b
|
||||
ret double %r
|
||||
@@ -228,11 +228,11 @@ define float @frem_f32_imm1(float %a) {
|
||||
; FAST-NEXT: .reg .b32 %f<5>;
|
||||
; FAST-EMPTY:
|
||||
; FAST-NEXT: // %bb.0:
|
||||
; FAST-NEXT: ld.param.f32 %f1, [frem_f32_imm1_param_0];
|
||||
; FAST-NEXT: ld.param.b32 %f1, [frem_f32_imm1_param_0];
|
||||
; FAST-NEXT: mul.f32 %f2, %f1, 0f3E124925;
|
||||
; FAST-NEXT: cvt.rzi.f32.f32 %f3, %f2;
|
||||
; FAST-NEXT: fma.rn.f32 %f4, %f3, 0fC0E00000, %f1;
|
||||
; FAST-NEXT: st.param.f32 [func_retval0], %f4;
|
||||
; FAST-NEXT: st.param.b32 [func_retval0], %f4;
|
||||
; FAST-NEXT: ret;
|
||||
;
|
||||
; NORMAL-LABEL: frem_f32_imm1(
|
||||
@@ -240,11 +240,11 @@ define float @frem_f32_imm1(float %a) {
|
||||
; NORMAL-NEXT: .reg .b32 %f<5>;
|
||||
; NORMAL-EMPTY:
|
||||
; NORMAL-NEXT: // %bb.0:
|
||||
; NORMAL-NEXT: ld.param.f32 %f1, [frem_f32_imm1_param_0];
|
||||
; NORMAL-NEXT: ld.param.b32 %f1, [frem_f32_imm1_param_0];
|
||||
; NORMAL-NEXT: div.rn.f32 %f2, %f1, 0f40E00000;
|
||||
; NORMAL-NEXT: cvt.rzi.f32.f32 %f3, %f2;
|
||||
; NORMAL-NEXT: fma.rn.f32 %f4, %f3, 0fC0E00000, %f1;
|
||||
; NORMAL-NEXT: st.param.f32 [func_retval0], %f4;
|
||||
; NORMAL-NEXT: st.param.b32 [func_retval0], %f4;
|
||||
; NORMAL-NEXT: ret;
|
||||
%r = frem float %a, 7.0
|
||||
ret float %r
|
||||
@@ -256,13 +256,13 @@ define float @frem_f32_imm2(float %a) {
|
||||
; FAST-NEXT: .reg .b32 %f<7>;
|
||||
; FAST-EMPTY:
|
||||
; FAST-NEXT: // %bb.0:
|
||||
; FAST-NEXT: ld.param.f32 %f1, [frem_f32_imm2_param_0];
|
||||
; FAST-NEXT: ld.param.b32 %f1, [frem_f32_imm2_param_0];
|
||||
; FAST-NEXT: mov.b32 %f2, 0f40E00000;
|
||||
; FAST-NEXT: div.approx.f32 %f3, %f2, %f1;
|
||||
; FAST-NEXT: cvt.rzi.f32.f32 %f4, %f3;
|
||||
; FAST-NEXT: neg.f32 %f5, %f4;
|
||||
; FAST-NEXT: fma.rn.f32 %f6, %f5, %f1, 0f40E00000;
|
||||
; FAST-NEXT: st.param.f32 [func_retval0], %f6;
|
||||
; FAST-NEXT: st.param.b32 [func_retval0], %f6;
|
||||
; FAST-NEXT: ret;
|
||||
;
|
||||
; NORMAL-LABEL: frem_f32_imm2(
|
||||
@@ -271,7 +271,7 @@ define float @frem_f32_imm2(float %a) {
|
||||
; NORMAL-NEXT: .reg .b32 %f<8>;
|
||||
; NORMAL-EMPTY:
|
||||
; NORMAL-NEXT: // %bb.0:
|
||||
; NORMAL-NEXT: ld.param.f32 %f1, [frem_f32_imm2_param_0];
|
||||
; NORMAL-NEXT: ld.param.b32 %f1, [frem_f32_imm2_param_0];
|
||||
; NORMAL-NEXT: mov.b32 %f2, 0f40E00000;
|
||||
; NORMAL-NEXT: div.rn.f32 %f3, %f2, %f1;
|
||||
; NORMAL-NEXT: cvt.rzi.f32.f32 %f4, %f3;
|
||||
@@ -279,7 +279,7 @@ define float @frem_f32_imm2(float %a) {
|
||||
; NORMAL-NEXT: fma.rn.f32 %f6, %f5, %f1, 0f40E00000;
|
||||
; NORMAL-NEXT: testp.infinite.f32 %p1, %f1;
|
||||
; NORMAL-NEXT: selp.f32 %f7, 0f40E00000, %f6, %p1;
|
||||
; NORMAL-NEXT: st.param.f32 [func_retval0], %f7;
|
||||
; NORMAL-NEXT: st.param.b32 [func_retval0], %f7;
|
||||
; NORMAL-NEXT: ret;
|
||||
%r = frem float 7.0, %a
|
||||
ret float %r
|
||||
|
||||
@@ -13,9 +13,9 @@ define i32 @fshr_clamp_r(i32 %hi, i32 %lo, i32 %n) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [fshr_clamp_r_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [fshr_clamp_r_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [fshr_clamp_r_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [fshr_clamp_r_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [fshr_clamp_r_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [fshr_clamp_r_param_2];
|
||||
; CHECK-NEXT: shf.r.clamp.b32 %r4, %r2, %r1, %r3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -29,9 +29,9 @@ define i32 @fshl_clamp_r(i32 %hi, i32 %lo, i32 %n) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [fshl_clamp_r_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [fshl_clamp_r_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [fshl_clamp_r_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [fshl_clamp_r_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [fshl_clamp_r_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [fshl_clamp_r_param_2];
|
||||
; CHECK-NEXT: shf.l.clamp.b32 %r4, %r2, %r1, %r3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r4;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -45,8 +45,8 @@ define i32 @fshr_clamp_i(i32 %hi, i32 %lo) {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [fshr_clamp_i_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [fshr_clamp_i_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [fshr_clamp_i_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [fshr_clamp_i_param_1];
|
||||
; CHECK-NEXT: shf.r.clamp.b32 %r3, %r2, %r1, 3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -60,8 +60,8 @@ define i32 @fshl_clamp_i(i32 %hi, i32 %lo) {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [fshl_clamp_i_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [fshl_clamp_i_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [fshl_clamp_i_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [fshl_clamp_i_param_1];
|
||||
; CHECK-NEXT: shf.l.clamp.b32 %r3, %r2, %r1, 3;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; CHECK-NEXT: ret;
|
||||
|
||||
@@ -15,9 +15,9 @@ target triple = "nvptx-nvidia-cuda"
|
||||
define ptx_kernel void @foo(ptr %a, ptr %b) {
|
||||
; Expect one load -- @myconst isn't loaded from, because we know its value
|
||||
; statically.
|
||||
; CHECK: ld.global.u32
|
||||
; CHECK: st.global.u32
|
||||
; CHECK: st.global.u32
|
||||
; CHECK: ld.global.b32
|
||||
; CHECK: st.global.b32
|
||||
; CHECK: st.global.b32
|
||||
%ld1 = load i32, ptr @myglobal
|
||||
%ld2 = load i32, ptr @myconst
|
||||
store i32 %ld1, ptr %a
|
||||
|
||||
@@ -7,10 +7,10 @@
|
||||
; CHK-LABEL: foo
|
||||
define void @foo(float %f) {
|
||||
entry:
|
||||
; CHK: ld.shared.f32 %{{[a-zA-Z0-9]+}}, [Gbl+8];
|
||||
; CHK: ld.shared.b32 %{{[a-zA-Z0-9]+}}, [Gbl+8];
|
||||
%0 = load float, ptr addrspace(3) getelementptr inbounds ([1024 x %MyStruct], ptr addrspace(3) @Gbl, i32 0, i32 0, i32 2)
|
||||
%add = fadd float %0, %f
|
||||
; CHK: st.shared.f32 [Gbl+8], %{{[a-zA-Z0-9]+}};
|
||||
; CHK: st.shared.b32 [Gbl+8], %{{[a-zA-Z0-9]+}};
|
||||
store float %add, ptr addrspace(3) getelementptr inbounds ([1024 x %MyStruct], ptr addrspace(3) @Gbl, i32 0, i32 0, i32 2)
|
||||
ret void
|
||||
}
|
||||
|
||||
@@ -26,8 +26,8 @@ define void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addrspace(1) %out)
|
||||
|
||||
define void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrspace(1) %in) {
|
||||
; CHECK-LABEL: @test_bitcast_to_half
|
||||
; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}]
|
||||
; CHECK: st.global.u16 [{{%rd[0-9]+}}], [[TMP]]
|
||||
; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}]
|
||||
; CHECK: st.global.b16 [{{%rd[0-9]+}}], [[TMP]]
|
||||
%val = load i16, ptr addrspace(1) %in
|
||||
%val_fp = bitcast i16 %val to half
|
||||
store half %val_fp, ptr addrspace(1) %out
|
||||
|
||||
@@ -11,15 +11,15 @@ define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) {
|
||||
; CHECK: .reg .b32 %r<4>;
|
||||
; CHECK: .reg .b64 %rd<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK: ld.param.u64 %rd1, [foo_param_0];
|
||||
; CHECK: ld.param.b64 %rd1, [foo_param_0];
|
||||
; CHECK: cvta.to.global.u64 %rd2, %rd1;
|
||||
; CHECK: ld.param.u64 %rd3, [foo_param_1];
|
||||
; CHECK: ld.param.b64 %rd3, [foo_param_1];
|
||||
; CHECK: cvta.to.global.u64 %rd4, %rd3;
|
||||
; CHECK: ld.global.nc.u8 %rs1, [%rd2];
|
||||
; CHECK: ld.global.nc.b8 %rs1, [%rd2];
|
||||
; CHECK: cvt.u32.u8 %r1, %rs1;
|
||||
; CHECK: add.s32 %r2, %r1, 1;
|
||||
; CHECK: and.b32 %r3, %r2, 1;
|
||||
; CHECK: st.global.u32 [%rd4], %r3;
|
||||
; CHECK: st.global.b32 [%rd4], %r3;
|
||||
; CHECK: ret;
|
||||
%ld = load i1, ptr %ptr, align 1
|
||||
%zext = zext i1 %ld to i32
|
||||
|
||||
@@ -11,9 +11,9 @@ define i32 @icmp_i1_eq(i32 %a, i32 %b) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_eq_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_eq_param_0];
|
||||
; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1;
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_eq_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_eq_param_1];
|
||||
; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1;
|
||||
; CHECK-NEXT: xor.pred %p3, %p1, %p2;
|
||||
; CHECK-NEXT: @%p3 bra $L__BB0_2;
|
||||
@@ -42,9 +42,9 @@ define i32 @icmp_i1_ne(i32 %a, i32 %b) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_ne_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_ne_param_0];
|
||||
; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1;
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_ne_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_ne_param_1];
|
||||
; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1;
|
||||
; CHECK-NEXT: xor.pred %p3, %p1, %p2;
|
||||
; CHECK-NEXT: not.pred %p4, %p3;
|
||||
@@ -74,9 +74,9 @@ define i32 @icmp_i1_sgt(i32 %a, i32 %b) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_sgt_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_sgt_param_0];
|
||||
; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1;
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_sgt_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_sgt_param_1];
|
||||
; CHECK-NEXT: setp.lt.s32 %p2, %r2, 2;
|
||||
; CHECK-NEXT: or.pred %p3, %p1, %p2;
|
||||
; CHECK-NEXT: @%p3 bra $L__BB2_2;
|
||||
@@ -105,9 +105,9 @@ define i32 @icmp_i1_slt(i32 %a, i32 %b) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_slt_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_slt_param_0];
|
||||
; CHECK-NEXT: setp.lt.s32 %p1, %r1, 2;
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_slt_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_slt_param_1];
|
||||
; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1;
|
||||
; CHECK-NEXT: or.pred %p3, %p2, %p1;
|
||||
; CHECK-NEXT: @%p3 bra $L__BB3_2;
|
||||
@@ -136,9 +136,9 @@ define i32 @icmp_i1_sge(i32 %a, i32 %b) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_sge_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_sge_param_0];
|
||||
; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1;
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_sge_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_sge_param_1];
|
||||
; CHECK-NEXT: setp.lt.s32 %p2, %r2, 2;
|
||||
; CHECK-NEXT: and.pred %p3, %p1, %p2;
|
||||
; CHECK-NEXT: @%p3 bra $L__BB4_2;
|
||||
@@ -167,9 +167,9 @@ define i32 @icmp_i1_sle(i32 %a, i32 %b) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_sle_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_sle_param_0];
|
||||
; CHECK-NEXT: setp.lt.s32 %p1, %r1, 2;
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_sle_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_sle_param_1];
|
||||
; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1;
|
||||
; CHECK-NEXT: and.pred %p3, %p2, %p1;
|
||||
; CHECK-NEXT: @%p3 bra $L__BB5_2;
|
||||
@@ -198,9 +198,9 @@ define i32 @icmp_i1_uge(i32 %a, i32 %b) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_uge_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_uge_param_0];
|
||||
; CHECK-NEXT: setp.lt.s32 %p1, %r1, 2;
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_uge_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_uge_param_1];
|
||||
; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1;
|
||||
; CHECK-NEXT: and.pred %p3, %p2, %p1;
|
||||
; CHECK-NEXT: @%p3 bra $L__BB6_2;
|
||||
@@ -229,9 +229,9 @@ define i32 @icmp_i1_ugt(i32 %a, i32 %b) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_ugt_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_ugt_param_0];
|
||||
; CHECK-NEXT: setp.lt.s32 %p1, %r1, 2;
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_ugt_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_ugt_param_1];
|
||||
; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1;
|
||||
; CHECK-NEXT: or.pred %p3, %p2, %p1;
|
||||
; CHECK-NEXT: @%p3 bra $L__BB7_2;
|
||||
@@ -260,9 +260,9 @@ define i32 @icmp_i1_ule(i32 %a, i32 %b) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_ule_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_ule_param_0];
|
||||
; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1;
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_ule_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_ule_param_1];
|
||||
; CHECK-NEXT: setp.lt.s32 %p2, %r2, 2;
|
||||
; CHECK-NEXT: and.pred %p3, %p1, %p2;
|
||||
; CHECK-NEXT: @%p3 bra $L__BB8_2;
|
||||
@@ -291,9 +291,9 @@ define i32 @icmp_i1_ult(i32 %a, i32 %b) {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_ult_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_ult_param_0];
|
||||
; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1;
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_ult_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_ult_param_1];
|
||||
; CHECK-NEXT: setp.lt.s32 %p2, %r2, 2;
|
||||
; CHECK-NEXT: or.pred %p3, %p1, %p2;
|
||||
; CHECK-NEXT: @%p3 bra $L__BB9_2;
|
||||
|
||||
@@ -12,12 +12,12 @@ define void @foo() {
|
||||
; CHECK: .reg .pred %p<2>;
|
||||
; CHECK: .reg .b16 %rs<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK: ld.global.u8 %rs1, [i1g];
|
||||
; CHECK: ld.global.b8 %rs1, [i1g];
|
||||
; CHECK: and.b16 %rs2, %rs1, 1;
|
||||
; CHECK: setp.ne.b16 %p1, %rs2, 0;
|
||||
; CHECK: @%p1 bra $L__BB0_2;
|
||||
; CHECK: mov.b16 %rs3, 1;
|
||||
; CHECK: st.global.u8 [i1g], %rs3;
|
||||
; CHECK: st.global.b8 [i1g], %rs3;
|
||||
; CHECK: ret;
|
||||
%tmp = load i1, ptr addrspace(1) @i1g, align 2
|
||||
br i1 %tmp, label %if.end, label %if.then
|
||||
|
||||
@@ -11,16 +11,16 @@ define i32 @test_select_i1_trunc(i32 %a, i32 %b, i32 %c, i32 %true, i32 %false)
|
||||
; CHECK-NEXT: .reg .b32 %r<10>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_select_i1_trunc_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_trunc_param_0];
|
||||
; CHECK-NEXT: and.b32 %r2, %r1, 1;
|
||||
; CHECK-NEXT: setp.ne.b32 %p1, %r2, 0;
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_select_i1_trunc_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r4, [test_select_i1_trunc_param_2];
|
||||
; CHECK-NEXT: ld.param.u32 %r5, [test_select_i1_trunc_param_3];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_trunc_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_trunc_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_trunc_param_3];
|
||||
; CHECK-NEXT: selp.b32 %r6, %r3, %r4, %p1;
|
||||
; CHECK-NEXT: and.b32 %r7, %r6, 1;
|
||||
; CHECK-NEXT: setp.ne.b32 %p2, %r7, 0;
|
||||
; CHECK-NEXT: ld.param.u32 %r8, [test_select_i1_trunc_param_4];
|
||||
; CHECK-NEXT: ld.param.b32 %r8, [test_select_i1_trunc_param_4];
|
||||
; CHECK-NEXT: selp.b32 %r9, %r5, %r8, %p2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r9;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -41,16 +41,16 @@ define i32 @test_select_i1_trunc_2(i64 %a, i16 %b, i32 %c, i32 %true, i32 %false
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_select_i1_trunc_2_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_select_i1_trunc_2_param_0];
|
||||
; CHECK-NEXT: and.b64 %rd2, %rd1, 1;
|
||||
; CHECK-NEXT: setp.ne.b64 %p1, %rd2, 0;
|
||||
; CHECK-NEXT: ld.param.u16 %rs1, [test_select_i1_trunc_2_param_1];
|
||||
; CHECK-NEXT: ld.param.u16 %rs2, [test_select_i1_trunc_2_param_2];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_select_i1_trunc_2_param_3];
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [test_select_i1_trunc_2_param_1];
|
||||
; CHECK-NEXT: ld.param.b16 %rs2, [test_select_i1_trunc_2_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_trunc_2_param_3];
|
||||
; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
|
||||
; CHECK-NEXT: and.b16 %rs4, %rs3, 1;
|
||||
; CHECK-NEXT: setp.ne.b16 %p2, %rs4, 0;
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_select_i1_trunc_2_param_4];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_trunc_2_param_4];
|
||||
; CHECK-NEXT: selp.b32 %r3, %r1, %r2, %p2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -69,15 +69,15 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals
|
||||
; CHECK-NEXT: .reg .b32 %r<12>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_select_i1_basic_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_select_i1_basic_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_basic_param_1];
|
||||
; CHECK-NEXT: or.b32 %r4, %r1, %r2;
|
||||
; CHECK-NEXT: setp.ne.s32 %p1, %r1, 0;
|
||||
; CHECK-NEXT: ld.param.u32 %r5, [test_select_i1_basic_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_param_2];
|
||||
; CHECK-NEXT: setp.eq.s32 %p2, %r5, 0;
|
||||
; CHECK-NEXT: ld.param.u32 %r7, [test_select_i1_basic_param_3];
|
||||
; CHECK-NEXT: ld.param.b32 %r7, [test_select_i1_basic_param_3];
|
||||
; CHECK-NEXT: setp.eq.s32 %p3, %r4, 0;
|
||||
; CHECK-NEXT: ld.param.u32 %r8, [test_select_i1_basic_param_4];
|
||||
; CHECK-NEXT: ld.param.b32 %r8, [test_select_i1_basic_param_4];
|
||||
; CHECK-NEXT: selp.b32 %r9, %r7, %r8, %p2;
|
||||
; CHECK-NEXT: selp.b32 %r10, %r9, %r8, %p1;
|
||||
; CHECK-NEXT: selp.b32 %r11, %r7, %r10, %p3;
|
||||
@@ -98,16 +98,16 @@ define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i
|
||||
; CHECK-NEXT: .reg .b32 %r<7>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_select_i1_basic_folding_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_folding_param_0];
|
||||
; CHECK-NEXT: setp.eq.s32 %p1, %r1, 0;
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_select_i1_basic_folding_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_basic_folding_param_1];
|
||||
; CHECK-NEXT: setp.ne.s32 %p2, %r2, 0;
|
||||
; CHECK-NEXT: setp.eq.s32 %p3, %r2, 0;
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_select_i1_basic_folding_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_2];
|
||||
; CHECK-NEXT: setp.eq.s32 %p4, %r3, 0;
|
||||
; CHECK-NEXT: ld.param.u32 %r4, [test_select_i1_basic_folding_param_3];
|
||||
; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_folding_param_3];
|
||||
; CHECK-NEXT: xor.pred %p6, %p1, %p3;
|
||||
; CHECK-NEXT: ld.param.u32 %r5, [test_select_i1_basic_folding_param_4];
|
||||
; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_4];
|
||||
; CHECK-NEXT: and.pred %p7, %p6, %p4;
|
||||
; CHECK-NEXT: and.pred %p9, %p2, %p4;
|
||||
; CHECK-NEXT: and.pred %p10, %p3, %p7;
|
||||
|
||||
@@ -8,8 +8,8 @@ define [2 x i128] @foo(i64 %a, i32 %b) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [foo_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [foo_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [foo_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0];
|
||||
; CHECK-NEXT: shr.s64 %rd2, %rd1, 63;
|
||||
; CHECK-NEXT: cvt.s64.s32 %rd3, %r1;
|
||||
; CHECK-NEXT: shr.s64 %rd4, %rd3, 63;
|
||||
@@ -30,8 +30,8 @@ define [2 x i128] @foo2(ptr byval([2 x i128]) %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<7>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [foo2_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd5, %rd6}, [foo2_param_0+16];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [foo2_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [foo2_param_0+16];
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd4};
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd5, %rd6};
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -51,8 +51,8 @@ define [2 x i128] @foo3([2 x i128] %a) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [foo3_param_0+16];
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [foo3_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [foo3_param_0+16];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [foo3_param_0];
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2};
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4};
|
||||
; CHECK-NEXT: ret;
|
||||
|
||||
@@ -10,11 +10,11 @@ define i128 @foo(ptr %p, ptr %o) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [foo_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [foo_param_0];
|
||||
; CHECK-NEXT: ld.u8 %rd3, [%rd1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [foo_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0];
|
||||
; CHECK-NEXT: ld.b8 %rd3, [%rd1];
|
||||
; CHECK-NEXT: mov.b64 %rd4, 0;
|
||||
; CHECK-NEXT: st.v2.u64 [%rd2], {%rd3, %rd4};
|
||||
; CHECK-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4};
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd4};
|
||||
; CHECK-NEXT: ret;
|
||||
%c = load i8, ptr %p, align 1
|
||||
|
||||
@@ -5,8 +5,8 @@
|
||||
; CHECK-NEXT: .param .align 16 .b8 callee_param_0[16],
|
||||
; CHECK-NEXT: .param .align 16 .b8 callee_param_1[16],
|
||||
define void @callee(i128, i128, ptr) {
|
||||
; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0];
|
||||
; CHECK-DAG: ld.param.v2.u64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [callee_param_1];
|
||||
; CHECK-DAG: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0];
|
||||
; CHECK-DAG: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [callee_param_1];
|
||||
|
||||
; CHECK: mul.lo.s64 %[[REG4:rd[0-9]+]], %[[REG0]], %[[REG3]];
|
||||
; CHECK-NEXT: mul.hi.u64 %[[REG5:rd[0-9]+]], %[[REG0]], %[[REG2]];
|
||||
@@ -25,8 +25,8 @@ define void @callee(i128, i128, ptr) {
|
||||
; CHECK-NEXT: .param .align 16 .b8 caller_kernel_param_1[16],
|
||||
define ptx_kernel void @caller_kernel(i128, i128, ptr) {
|
||||
start:
|
||||
; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_kernel_param_0];
|
||||
; CHECK-DAG: ld.param.v2.u64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_kernel_param_1];
|
||||
; CHECK-DAG: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_kernel_param_0];
|
||||
; CHECK-DAG: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_kernel_param_1];
|
||||
|
||||
; CHECK: { // callseq [[CALLSEQ_ID:[0-9]]], 0
|
||||
; CHECK: .param .align 16 .b8 param0[16];
|
||||
@@ -44,8 +44,8 @@ start:
|
||||
; CHECK-NEXT: .param .align 16 .b8 caller_func_param_1[16],
|
||||
define void @caller_func(i128, i128, ptr) {
|
||||
start:
|
||||
; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_func_param_0]
|
||||
; CHECK-DAG: ld.param.v2.u64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_func_param_1]
|
||||
; CHECK-DAG: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_func_param_0]
|
||||
; CHECK-DAG: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_func_param_1]
|
||||
|
||||
; CHECK: { // callseq [[CALLSEQ_ID:[0-9]]], 0
|
||||
; CHECK: .param .align 16 .b8 param0[16];
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
; CHECK-LABEL: .visible .func (.param .align 16 .b8 func_retval0[16]) callee(
|
||||
define i128 @callee(i128) {
|
||||
; CHECK: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0];
|
||||
; CHECK: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0];
|
||||
; CHECK: st.param.v2.b64 [func_retval0], {%[[REG0]], %[[REG1]]}
|
||||
ret i128 %0
|
||||
}
|
||||
@@ -11,8 +11,8 @@ define i128 @callee(i128) {
|
||||
; CHECK-LABEL: .visible .func caller(
|
||||
define void @caller(i128, ptr) {
|
||||
start:
|
||||
; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_param_0];
|
||||
; CHECK-DAG: ld.param.u64 %[[OUT:rd[0-9]+]], [caller_param_1];
|
||||
; CHECK-DAG: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_param_0];
|
||||
; CHECK-DAG: ld.param.b64 %[[OUT:rd[0-9]+]], [caller_param_1];
|
||||
|
||||
; CHECK: { // callseq 0, 0
|
||||
; CHECK: .param .align 16 .b8 retval0[16];
|
||||
@@ -21,7 +21,7 @@ start:
|
||||
; CHECK: } // callseq 0
|
||||
%a = call i128 @callee(i128 %0)
|
||||
|
||||
; CHECK-DAG: st.v2.u64 [%[[OUT]]], {%[[REG2]], %[[REG3]]};
|
||||
; CHECK-DAG: st.v2.b64 [%[[OUT]]], {%[[REG2]], %[[REG3]]};
|
||||
store i128 %a, ptr %1
|
||||
|
||||
ret void
|
||||
|
||||
@@ -10,8 +10,8 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<127>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [srem_i128_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [srem_i128_param_1];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [srem_i128_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd49, %rd50}, [srem_i128_param_1];
|
||||
; CHECK-NEXT: shr.s64 %rd2, %rd46, 63;
|
||||
; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45;
|
||||
; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46;
|
||||
@@ -151,8 +151,8 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<113>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd41, %rd42}, [urem_i128_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [urem_i128_param_1];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [urem_i128_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [urem_i128_param_1];
|
||||
; CHECK-NEXT: or.b64 %rd45, %rd3, %rd4;
|
||||
; CHECK-NEXT: setp.eq.s64 %p1, %rd45, 0;
|
||||
; CHECK-NEXT: or.b64 %rd46, %rd41, %rd42;
|
||||
@@ -275,7 +275,7 @@ define i128 @srem_i128_pow2k(i128 %lhs) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<10>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [srem_i128_pow2k_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [srem_i128_pow2k_param_0];
|
||||
; CHECK-NEXT: shr.s64 %rd3, %rd2, 63;
|
||||
; CHECK-NEXT: shr.u64 %rd4, %rd3, 31;
|
||||
; CHECK-NEXT: add.cc.s64 %rd5, %rd1, %rd4;
|
||||
@@ -295,7 +295,7 @@ define i128 @urem_i128_pow2k(i128 %lhs) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [urem_i128_pow2k_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [urem_i128_pow2k_param_0];
|
||||
; CHECK-NEXT: and.b64 %rd3, %rd1, 8589934591;
|
||||
; CHECK-NEXT: mov.b64 %rd4, 0;
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd4};
|
||||
@@ -312,8 +312,8 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<122>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [sdiv_i128_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [sdiv_i128_param_1];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [sdiv_i128_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd49, %rd50}, [sdiv_i128_param_1];
|
||||
; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45;
|
||||
; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46;
|
||||
; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0;
|
||||
@@ -448,8 +448,8 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<107>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %_udiv-special-cases
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd41, %rd42}, [udiv_i128_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd43, %rd44}, [udiv_i128_param_1];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [udiv_i128_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd43, %rd44}, [udiv_i128_param_1];
|
||||
; CHECK-NEXT: or.b64 %rd45, %rd43, %rd44;
|
||||
; CHECK-NEXT: setp.eq.s64 %p1, %rd45, 0;
|
||||
; CHECK-NEXT: or.b64 %rd46, %rd41, %rd42;
|
||||
@@ -566,7 +566,7 @@ define i128 @sdiv_i128_pow2k(i128 %lhs) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<11>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [sdiv_i128_pow2k_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [sdiv_i128_pow2k_param_0];
|
||||
; CHECK-NEXT: shr.s64 %rd3, %rd2, 63;
|
||||
; CHECK-NEXT: shr.u64 %rd4, %rd3, 31;
|
||||
; CHECK-NEXT: add.cc.s64 %rd5, %rd1, %rd4;
|
||||
@@ -587,7 +587,7 @@ define i128 @udiv_i128_pow2k(i128 %lhs) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<7>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [udiv_i128_pow2k_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [udiv_i128_pow2k_param_0];
|
||||
; CHECK-NEXT: shl.b64 %rd3, %rd2, 31;
|
||||
; CHECK-NEXT: shr.u64 %rd4, %rd1, 33;
|
||||
; CHECK-NEXT: or.b64 %rd5, %rd4, %rd3;
|
||||
@@ -604,8 +604,8 @@ define i128 @add_i128(i128 %lhs, i128 %rhs) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<7>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [add_i128_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [add_i128_param_1];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [add_i128_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [add_i128_param_1];
|
||||
; CHECK-NEXT: add.cc.s64 %rd5, %rd1, %rd3;
|
||||
; CHECK-NEXT: addc.cc.s64 %rd6, %rd2, %rd4;
|
||||
; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd5, %rd6};
|
||||
|
||||
@@ -39,7 +39,7 @@ define i16 @test_extract_0(<2 x i16> %a) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<3>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_extract_0_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_extract_0_param_0];
|
||||
; I16x2-NEXT: mov.b32 {%rs1, _}, %r1;
|
||||
; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; }
|
||||
; COMMON-NEXT: cvt.u32.u16 %r2, %rs1;
|
||||
@@ -56,7 +56,7 @@ define i16 @test_extract_1(<2 x i16> %a) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<3>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_extract_1_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_extract_1_param_0];
|
||||
; I16x2-NEXT: mov.b32 {_, %rs1}, %r1;
|
||||
; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; }
|
||||
; COMMON-NEXT: cvt.u32.u16 %r2, %rs1;
|
||||
@@ -75,8 +75,8 @@ define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 {
|
||||
; COMMON-NEXT: .reg .b64 %rd<2>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1];
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_extract_i_param_0];
|
||||
; COMMON-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_extract_i_param_0];
|
||||
; COMMON-NEXT: setp.eq.s64 %p1, %rd1, 0;
|
||||
; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; COMMON-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1;
|
||||
@@ -93,8 +93,8 @@ define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; I16x2-NEXT: .reg .b32 %r<4>;
|
||||
; I16x2-EMPTY:
|
||||
; I16x2-NEXT: // %bb.0:
|
||||
; I16x2-NEXT: ld.param.u32 %r2, [test_add_param_1];
|
||||
; I16x2-NEXT: ld.param.u32 %r1, [test_add_param_0];
|
||||
; I16x2-NEXT: ld.param.b32 %r2, [test_add_param_1];
|
||||
; I16x2-NEXT: ld.param.b32 %r1, [test_add_param_0];
|
||||
; I16x2-NEXT: add.s16x2 %r3, %r1, %r2;
|
||||
; I16x2-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; I16x2-NEXT: ret;
|
||||
@@ -105,8 +105,8 @@ define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; NO-I16x2-NEXT: .reg .b32 %r<4>;
|
||||
; NO-I16x2-EMPTY:
|
||||
; NO-I16x2-NEXT: // %bb.0:
|
||||
; NO-I16x2-NEXT: ld.param.u32 %r2, [test_add_param_1];
|
||||
; NO-I16x2-NEXT: ld.param.u32 %r1, [test_add_param_0];
|
||||
; NO-I16x2-NEXT: ld.param.b32 %r2, [test_add_param_1];
|
||||
; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_param_0];
|
||||
; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1;
|
||||
; NO-I16x2-NEXT: add.s16 %rs5, %rs4, %rs2;
|
||||
@@ -125,7 +125,7 @@ define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 {
|
||||
; I16x2-NEXT: .reg .b32 %r<4>;
|
||||
; I16x2-EMPTY:
|
||||
; I16x2-NEXT: // %bb.0:
|
||||
; I16x2-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0];
|
||||
; I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_0_param_0];
|
||||
; I16x2-NEXT: mov.b32 %r2, 131073;
|
||||
; I16x2-NEXT: add.s16x2 %r3, %r1, %r2;
|
||||
; I16x2-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
@@ -137,7 +137,7 @@ define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 {
|
||||
; NO-I16x2-NEXT: .reg .b32 %r<3>;
|
||||
; NO-I16x2-EMPTY:
|
||||
; NO-I16x2-NEXT: // %bb.0:
|
||||
; NO-I16x2-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0];
|
||||
; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_0_param_0];
|
||||
; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; NO-I16x2-NEXT: add.s16 %rs3, %rs2, 2;
|
||||
; NO-I16x2-NEXT: add.s16 %rs4, %rs1, 1;
|
||||
@@ -154,7 +154,7 @@ define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 {
|
||||
; I16x2-NEXT: .reg .b32 %r<4>;
|
||||
; I16x2-EMPTY:
|
||||
; I16x2-NEXT: // %bb.0:
|
||||
; I16x2-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0];
|
||||
; I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_1_param_0];
|
||||
; I16x2-NEXT: mov.b32 %r2, 131073;
|
||||
; I16x2-NEXT: add.s16x2 %r3, %r1, %r2;
|
||||
; I16x2-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
@@ -166,7 +166,7 @@ define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 {
|
||||
; NO-I16x2-NEXT: .reg .b32 %r<3>;
|
||||
; NO-I16x2-EMPTY:
|
||||
; NO-I16x2-NEXT: // %bb.0:
|
||||
; NO-I16x2-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0];
|
||||
; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_1_param_0];
|
||||
; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; NO-I16x2-NEXT: add.s16 %rs3, %rs2, 2;
|
||||
; NO-I16x2-NEXT: add.s16 %rs4, %rs1, 1;
|
||||
@@ -184,8 +184,8 @@ define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<4>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r2, [test_sub_param_1];
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_sub_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r2, [test_sub_param_1];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_sub_param_0];
|
||||
; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1;
|
||||
; COMMON-NEXT: sub.s16 %rs5, %rs4, %rs2;
|
||||
@@ -203,8 +203,8 @@ define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; I16x2-NEXT: .reg .b32 %r<4>;
|
||||
; I16x2-EMPTY:
|
||||
; I16x2-NEXT: // %bb.0:
|
||||
; I16x2-NEXT: ld.param.u32 %r2, [test_smax_param_1];
|
||||
; I16x2-NEXT: ld.param.u32 %r1, [test_smax_param_0];
|
||||
; I16x2-NEXT: ld.param.b32 %r2, [test_smax_param_1];
|
||||
; I16x2-NEXT: ld.param.b32 %r1, [test_smax_param_0];
|
||||
; I16x2-NEXT: max.s16x2 %r3, %r1, %r2;
|
||||
; I16x2-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; I16x2-NEXT: ret;
|
||||
@@ -215,8 +215,8 @@ define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; NO-I16x2-NEXT: .reg .b32 %r<4>;
|
||||
; NO-I16x2-EMPTY:
|
||||
; NO-I16x2-NEXT: // %bb.0:
|
||||
; NO-I16x2-NEXT: ld.param.u32 %r2, [test_smax_param_1];
|
||||
; NO-I16x2-NEXT: ld.param.u32 %r1, [test_smax_param_0];
|
||||
; NO-I16x2-NEXT: ld.param.b32 %r2, [test_smax_param_1];
|
||||
; NO-I16x2-NEXT: ld.param.b32 %r1, [test_smax_param_0];
|
||||
; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1;
|
||||
; NO-I16x2-NEXT: max.s16 %rs5, %rs4, %rs2;
|
||||
@@ -235,8 +235,8 @@ define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; I16x2-NEXT: .reg .b32 %r<4>;
|
||||
; I16x2-EMPTY:
|
||||
; I16x2-NEXT: // %bb.0:
|
||||
; I16x2-NEXT: ld.param.u32 %r2, [test_umax_param_1];
|
||||
; I16x2-NEXT: ld.param.u32 %r1, [test_umax_param_0];
|
||||
; I16x2-NEXT: ld.param.b32 %r2, [test_umax_param_1];
|
||||
; I16x2-NEXT: ld.param.b32 %r1, [test_umax_param_0];
|
||||
; I16x2-NEXT: max.u16x2 %r3, %r1, %r2;
|
||||
; I16x2-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; I16x2-NEXT: ret;
|
||||
@@ -247,8 +247,8 @@ define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; NO-I16x2-NEXT: .reg .b32 %r<4>;
|
||||
; NO-I16x2-EMPTY:
|
||||
; NO-I16x2-NEXT: // %bb.0:
|
||||
; NO-I16x2-NEXT: ld.param.u32 %r2, [test_umax_param_1];
|
||||
; NO-I16x2-NEXT: ld.param.u32 %r1, [test_umax_param_0];
|
||||
; NO-I16x2-NEXT: ld.param.b32 %r2, [test_umax_param_1];
|
||||
; NO-I16x2-NEXT: ld.param.b32 %r1, [test_umax_param_0];
|
||||
; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1;
|
||||
; NO-I16x2-NEXT: max.u16 %rs5, %rs4, %rs2;
|
||||
@@ -267,8 +267,8 @@ define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; I16x2-NEXT: .reg .b32 %r<4>;
|
||||
; I16x2-EMPTY:
|
||||
; I16x2-NEXT: // %bb.0:
|
||||
; I16x2-NEXT: ld.param.u32 %r2, [test_smin_param_1];
|
||||
; I16x2-NEXT: ld.param.u32 %r1, [test_smin_param_0];
|
||||
; I16x2-NEXT: ld.param.b32 %r2, [test_smin_param_1];
|
||||
; I16x2-NEXT: ld.param.b32 %r1, [test_smin_param_0];
|
||||
; I16x2-NEXT: min.s16x2 %r3, %r1, %r2;
|
||||
; I16x2-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; I16x2-NEXT: ret;
|
||||
@@ -279,8 +279,8 @@ define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; NO-I16x2-NEXT: .reg .b32 %r<4>;
|
||||
; NO-I16x2-EMPTY:
|
||||
; NO-I16x2-NEXT: // %bb.0:
|
||||
; NO-I16x2-NEXT: ld.param.u32 %r2, [test_smin_param_1];
|
||||
; NO-I16x2-NEXT: ld.param.u32 %r1, [test_smin_param_0];
|
||||
; NO-I16x2-NEXT: ld.param.b32 %r2, [test_smin_param_1];
|
||||
; NO-I16x2-NEXT: ld.param.b32 %r1, [test_smin_param_0];
|
||||
; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1;
|
||||
; NO-I16x2-NEXT: min.s16 %rs5, %rs4, %rs2;
|
||||
@@ -299,8 +299,8 @@ define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; I16x2-NEXT: .reg .b32 %r<4>;
|
||||
; I16x2-EMPTY:
|
||||
; I16x2-NEXT: // %bb.0:
|
||||
; I16x2-NEXT: ld.param.u32 %r2, [test_umin_param_1];
|
||||
; I16x2-NEXT: ld.param.u32 %r1, [test_umin_param_0];
|
||||
; I16x2-NEXT: ld.param.b32 %r2, [test_umin_param_1];
|
||||
; I16x2-NEXT: ld.param.b32 %r1, [test_umin_param_0];
|
||||
; I16x2-NEXT: min.u16x2 %r3, %r1, %r2;
|
||||
; I16x2-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; I16x2-NEXT: ret;
|
||||
@@ -311,8 +311,8 @@ define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; NO-I16x2-NEXT: .reg .b32 %r<4>;
|
||||
; NO-I16x2-EMPTY:
|
||||
; NO-I16x2-NEXT: // %bb.0:
|
||||
; NO-I16x2-NEXT: ld.param.u32 %r2, [test_umin_param_1];
|
||||
; NO-I16x2-NEXT: ld.param.u32 %r1, [test_umin_param_0];
|
||||
; NO-I16x2-NEXT: ld.param.b32 %r2, [test_umin_param_1];
|
||||
; NO-I16x2-NEXT: ld.param.b32 %r1, [test_umin_param_0];
|
||||
; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1;
|
||||
; NO-I16x2-NEXT: min.u16 %rs5, %rs4, %rs2;
|
||||
@@ -332,8 +332,8 @@ define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<4>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r2, [test_mul_param_1];
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_mul_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r2, [test_mul_param_1];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_mul_param_0];
|
||||
; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1;
|
||||
; COMMON-NEXT: mul.lo.s16 %rs5, %rs4, %rs2;
|
||||
@@ -352,8 +352,8 @@ define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<4>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r2, [test_or_param_1];
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_or_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r2, [test_or_param_1];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_or_param_0];
|
||||
; COMMON-NEXT: or.b32 %r3, %r1, %r2;
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; COMMON-NEXT: ret;
|
||||
@@ -370,7 +370,7 @@ define <2 x i16> @test_or_computed(i16 %a) {
|
||||
; COMMON-NEXT: .reg .b32 %r<4>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u16 %rs1, [test_or_computed_param_0];
|
||||
; COMMON-NEXT: ld.param.b16 %rs1, [test_or_computed_param_0];
|
||||
; COMMON-NEXT: mov.b16 %rs2, 0;
|
||||
; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2};
|
||||
; COMMON-NEXT: mov.b16 %rs3, 5;
|
||||
@@ -391,7 +391,7 @@ define <2 x i16> @test_or_imm_0(<2 x i16> %a) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<3>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_or_imm_0_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_or_imm_0_param_0];
|
||||
; COMMON-NEXT: or.b32 %r2, %r1, 131073;
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; COMMON-NEXT: ret;
|
||||
@@ -405,7 +405,7 @@ define <2 x i16> @test_or_imm_1(<2 x i16> %a) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<3>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_or_imm_1_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_or_imm_1_param_0];
|
||||
; COMMON-NEXT: or.b32 %r2, %r1, 131073;
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; COMMON-NEXT: ret;
|
||||
@@ -419,8 +419,8 @@ define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<4>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r2, [test_xor_param_1];
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_xor_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r2, [test_xor_param_1];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_xor_param_0];
|
||||
; COMMON-NEXT: xor.b32 %r3, %r1, %r2;
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; COMMON-NEXT: ret;
|
||||
@@ -435,7 +435,7 @@ define <2 x i16> @test_xor_computed(i16 %a) {
|
||||
; COMMON-NEXT: .reg .b32 %r<4>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u16 %rs1, [test_xor_computed_param_0];
|
||||
; COMMON-NEXT: ld.param.b16 %rs1, [test_xor_computed_param_0];
|
||||
; COMMON-NEXT: mov.b16 %rs2, 0;
|
||||
; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2};
|
||||
; COMMON-NEXT: mov.b16 %rs3, 5;
|
||||
@@ -456,7 +456,7 @@ define <2 x i16> @test_xor_imm_0(<2 x i16> %a) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<3>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_xor_imm_0_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_xor_imm_0_param_0];
|
||||
; COMMON-NEXT: xor.b32 %r2, %r1, 131073;
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; COMMON-NEXT: ret;
|
||||
@@ -470,7 +470,7 @@ define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<3>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_xor_imm_1_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_xor_imm_1_param_0];
|
||||
; COMMON-NEXT: xor.b32 %r2, %r1, 131073;
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; COMMON-NEXT: ret;
|
||||
@@ -484,8 +484,8 @@ define <2 x i16> @test_and(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<4>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r2, [test_and_param_1];
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_and_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r2, [test_and_param_1];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_and_param_0];
|
||||
; COMMON-NEXT: and.b32 %r3, %r1, %r2;
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; COMMON-NEXT: ret;
|
||||
@@ -502,7 +502,7 @@ define <2 x i16> @test_and_computed(i16 %a) {
|
||||
; COMMON-NEXT: .reg .b32 %r<4>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u16 %rs1, [test_and_computed_param_0];
|
||||
; COMMON-NEXT: ld.param.b16 %rs1, [test_and_computed_param_0];
|
||||
; COMMON-NEXT: mov.b16 %rs2, 0;
|
||||
; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2};
|
||||
; COMMON-NEXT: mov.b16 %rs3, 5;
|
||||
@@ -523,7 +523,7 @@ define <2 x i16> @test_and_imm_0(<2 x i16> %a) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<3>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_and_imm_0_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_and_imm_0_param_0];
|
||||
; COMMON-NEXT: and.b32 %r2, %r1, 131073;
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; COMMON-NEXT: ret;
|
||||
@@ -537,7 +537,7 @@ define <2 x i16> @test_and_imm_1(<2 x i16> %a) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<3>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_and_imm_1_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_and_imm_1_param_0];
|
||||
; COMMON-NEXT: and.b32 %r2, %r1, 131073;
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; COMMON-NEXT: ret;
|
||||
@@ -552,10 +552,10 @@ define void @test_ldst_v2i16(ptr %a, ptr %b) {
|
||||
; COMMON-NEXT: .reg .b64 %rd<3>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u64 %rd2, [test_ldst_v2i16_param_1];
|
||||
; COMMON-NEXT: ld.param.u64 %rd1, [test_ldst_v2i16_param_0];
|
||||
; COMMON-NEXT: ld.u32 %r1, [%rd1];
|
||||
; COMMON-NEXT: st.u32 [%rd2], %r1;
|
||||
; COMMON-NEXT: ld.param.b64 %rd2, [test_ldst_v2i16_param_1];
|
||||
; COMMON-NEXT: ld.param.b64 %rd1, [test_ldst_v2i16_param_0];
|
||||
; COMMON-NEXT: ld.b32 %r1, [%rd1];
|
||||
; COMMON-NEXT: st.b32 [%rd2], %r1;
|
||||
; COMMON-NEXT: ret;
|
||||
%t1 = load <2 x i16>, ptr %a
|
||||
store <2 x i16> %t1, ptr %b, align 16
|
||||
@@ -572,12 +572,12 @@ define void @test_ldst_v3i16(ptr %a, ptr %b) {
|
||||
; COMMON-NEXT: .reg .b64 %rd<5>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u64 %rd2, [test_ldst_v3i16_param_1];
|
||||
; COMMON-NEXT: ld.param.u64 %rd1, [test_ldst_v3i16_param_0];
|
||||
; COMMON-NEXT: ld.u64 %rd3, [%rd1];
|
||||
; COMMON-NEXT: ld.param.b64 %rd2, [test_ldst_v3i16_param_1];
|
||||
; COMMON-NEXT: ld.param.b64 %rd1, [test_ldst_v3i16_param_0];
|
||||
; COMMON-NEXT: ld.b64 %rd3, [%rd1];
|
||||
; COMMON-NEXT: shr.u64 %rd4, %rd3, 32;
|
||||
; COMMON-NEXT: st.u32 [%rd2], %rd3;
|
||||
; COMMON-NEXT: st.u16 [%rd2+4], %rd4;
|
||||
; COMMON-NEXT: st.b32 [%rd2], %rd3;
|
||||
; COMMON-NEXT: st.b16 [%rd2+4], %rd4;
|
||||
; COMMON-NEXT: ret;
|
||||
%t1 = load <3 x i16>, ptr %a
|
||||
store <3 x i16> %t1, ptr %b, align 16
|
||||
@@ -591,10 +591,10 @@ define void @test_ldst_v4i16(ptr %a, ptr %b) {
|
||||
; COMMON-NEXT: .reg .b64 %rd<3>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u64 %rd2, [test_ldst_v4i16_param_1];
|
||||
; COMMON-NEXT: ld.param.u64 %rd1, [test_ldst_v4i16_param_0];
|
||||
; COMMON-NEXT: ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
|
||||
; COMMON-NEXT: st.v4.u16 [%rd2], {%rs1, %rs2, %rs3, %rs4};
|
||||
; COMMON-NEXT: ld.param.b64 %rd2, [test_ldst_v4i16_param_1];
|
||||
; COMMON-NEXT: ld.param.b64 %rd1, [test_ldst_v4i16_param_0];
|
||||
; COMMON-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
|
||||
; COMMON-NEXT: st.v4.b16 [%rd2], {%rs1, %rs2, %rs3, %rs4};
|
||||
; COMMON-NEXT: ret;
|
||||
%t1 = load <4 x i16>, ptr %a
|
||||
store <4 x i16> %t1, ptr %b, align 16
|
||||
@@ -608,8 +608,8 @@ define void @test_ldst_v8i16(ptr %a, ptr %b) {
|
||||
; COMMON-NEXT: .reg .b64 %rd<3>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u64 %rd2, [test_ldst_v8i16_param_1];
|
||||
; COMMON-NEXT: ld.param.u64 %rd1, [test_ldst_v8i16_param_0];
|
||||
; COMMON-NEXT: ld.param.b64 %rd2, [test_ldst_v8i16_param_1];
|
||||
; COMMON-NEXT: ld.param.b64 %rd1, [test_ldst_v8i16_param_0];
|
||||
; COMMON-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
|
||||
; COMMON-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4};
|
||||
; COMMON-NEXT: ret;
|
||||
@@ -626,8 +626,8 @@ define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<5>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r2, [test_call_param_1];
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_call_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r2, [test_call_param_1];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_call_param_0];
|
||||
; COMMON-NEXT: { // callseq 0, 0
|
||||
; COMMON-NEXT: .param .align 4 .b8 param0[4];
|
||||
; COMMON-NEXT: st.param.b32 [param0], %r1;
|
||||
@@ -654,8 +654,8 @@ define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<5>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r2, [test_call_flipped_param_1];
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_call_flipped_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0];
|
||||
; COMMON-NEXT: { // callseq 1, 0
|
||||
; COMMON-NEXT: .param .align 4 .b8 param0[4];
|
||||
; COMMON-NEXT: st.param.b32 [param0], %r2;
|
||||
@@ -682,8 +682,8 @@ define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<5>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r2, [test_tailcall_flipped_param_1];
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_tailcall_flipped_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0];
|
||||
; COMMON-NEXT: { // callseq 2, 0
|
||||
; COMMON-NEXT: .param .align 4 .b8 param0[4];
|
||||
; COMMON-NEXT: st.param.b32 [param0], %r2;
|
||||
@@ -712,11 +712,11 @@ define <2 x i16> @test_select(<2 x i16> %a, <2 x i16> %b, i1 zeroext %c) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<4>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u8 %rs1, [test_select_param_2];
|
||||
; COMMON-NEXT: ld.param.b8 %rs1, [test_select_param_2];
|
||||
; COMMON-NEXT: and.b16 %rs2, %rs1, 1;
|
||||
; COMMON-NEXT: setp.ne.b16 %p1, %rs2, 0;
|
||||
; COMMON-NEXT: ld.param.u32 %r2, [test_select_param_1];
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_select_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r2, [test_select_param_1];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_select_param_0];
|
||||
; COMMON-NEXT: selp.b32 %r3, %r1, %r2, %p1;
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; COMMON-NEXT: ret;
|
||||
@@ -732,10 +732,10 @@ define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x
|
||||
; COMMON-NEXT: .reg .b32 %r<6>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r4, [test_select_cc_param_3];
|
||||
; COMMON-NEXT: ld.param.u32 %r3, [test_select_cc_param_2];
|
||||
; COMMON-NEXT: ld.param.u32 %r2, [test_select_cc_param_1];
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_select_cc_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r4, [test_select_cc_param_3];
|
||||
; COMMON-NEXT: ld.param.b32 %r3, [test_select_cc_param_2];
|
||||
; COMMON-NEXT: ld.param.b32 %r2, [test_select_cc_param_1];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_select_cc_param_0];
|
||||
; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r4;
|
||||
; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r3;
|
||||
; COMMON-NEXT: setp.ne.s16 %p1, %rs3, %rs1;
|
||||
@@ -760,10 +760,10 @@ define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b,
|
||||
; COMMON-NEXT: .reg .b32 %r<9>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_select_cc_i32_i16_param_1];
|
||||
; COMMON-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_select_cc_i32_i16_param_0];
|
||||
; COMMON-NEXT: ld.param.u32 %r6, [test_select_cc_i32_i16_param_3];
|
||||
; COMMON-NEXT: ld.param.u32 %r5, [test_select_cc_i32_i16_param_2];
|
||||
; COMMON-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i32_i16_param_1];
|
||||
; COMMON-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_i32_i16_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r6, [test_select_cc_i32_i16_param_3];
|
||||
; COMMON-NEXT: ld.param.b32 %r5, [test_select_cc_i32_i16_param_2];
|
||||
; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r6;
|
||||
; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r5;
|
||||
; COMMON-NEXT: setp.ne.s16 %p1, %rs3, %rs1;
|
||||
@@ -786,10 +786,10 @@ define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b,
|
||||
; COMMON-NEXT: .reg .b32 %r<8>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.v2.u32 {%r5, %r6}, [test_select_cc_i16_i32_param_3];
|
||||
; COMMON-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_select_cc_i16_i32_param_2];
|
||||
; COMMON-NEXT: ld.param.u32 %r2, [test_select_cc_i16_i32_param_1];
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_select_cc_i16_i32_param_0];
|
||||
; COMMON-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_i16_i32_param_3];
|
||||
; COMMON-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i16_i32_param_2];
|
||||
; COMMON-NEXT: ld.param.b32 %r2, [test_select_cc_i16_i32_param_1];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_select_cc_i16_i32_param_0];
|
||||
; COMMON-NEXT: setp.ne.s32 %p1, %r3, %r5;
|
||||
; COMMON-NEXT: setp.ne.s32 %p2, %r4, %r6;
|
||||
; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
@@ -812,7 +812,7 @@ define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<4>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_param_0];
|
||||
; COMMON-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_param_0];
|
||||
; COMMON-NEXT: prmt.b32 %r3, %r1, %r2, 0x5410U;
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; COMMON-NEXT: ret;
|
||||
@@ -827,12 +827,12 @@ define <2 x i16> @test_trunc_2xi32_muliple_use0(<2 x i32> %a, ptr %p) #0 {
|
||||
; I16x2-NEXT: .reg .b64 %rd<2>;
|
||||
; I16x2-EMPTY:
|
||||
; I16x2-NEXT: // %bb.0:
|
||||
; I16x2-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0];
|
||||
; I16x2-NEXT: ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use0_param_1];
|
||||
; I16x2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0];
|
||||
; I16x2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_muliple_use0_param_1];
|
||||
; I16x2-NEXT: prmt.b32 %r3, %r1, %r2, 0x5410U;
|
||||
; I16x2-NEXT: mov.b32 %r4, 65537;
|
||||
; I16x2-NEXT: add.s16x2 %r5, %r3, %r4;
|
||||
; I16x2-NEXT: st.u32 [%rd1], %r5;
|
||||
; I16x2-NEXT: st.b32 [%rd1], %r5;
|
||||
; I16x2-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; I16x2-NEXT: ret;
|
||||
;
|
||||
@@ -843,15 +843,15 @@ define <2 x i16> @test_trunc_2xi32_muliple_use0(<2 x i32> %a, ptr %p) #0 {
|
||||
; NO-I16x2-NEXT: .reg .b64 %rd<2>;
|
||||
; NO-I16x2-EMPTY:
|
||||
; NO-I16x2-NEXT: // %bb.0:
|
||||
; NO-I16x2-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0];
|
||||
; NO-I16x2-NEXT: ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use0_param_1];
|
||||
; NO-I16x2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0];
|
||||
; NO-I16x2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_muliple_use0_param_1];
|
||||
; NO-I16x2-NEXT: cvt.u16.u32 %rs1, %r2;
|
||||
; NO-I16x2-NEXT: cvt.u16.u32 %rs2, %r1;
|
||||
; NO-I16x2-NEXT: mov.b32 %r3, {%rs2, %rs1};
|
||||
; NO-I16x2-NEXT: add.s16 %rs3, %rs1, 1;
|
||||
; NO-I16x2-NEXT: add.s16 %rs4, %rs2, 1;
|
||||
; NO-I16x2-NEXT: mov.b32 %r4, {%rs4, %rs3};
|
||||
; NO-I16x2-NEXT: st.u32 [%rd1], %r4;
|
||||
; NO-I16x2-NEXT: st.b32 [%rd1], %r4;
|
||||
; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; NO-I16x2-NEXT: ret;
|
||||
%r = trunc <2 x i32> %a to <2 x i16>
|
||||
@@ -869,12 +869,12 @@ define <2 x i16> @test_trunc_2xi32_muliple_use1(<2 x i32> %a, ptr %p) #0 {
|
||||
; COMMON-NEXT: .reg .b64 %rd<2>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use1_param_0];
|
||||
; COMMON-NEXT: ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use1_param_1];
|
||||
; COMMON-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_muliple_use1_param_0];
|
||||
; COMMON-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_muliple_use1_param_1];
|
||||
; COMMON-NEXT: prmt.b32 %r3, %r1, %r2, 0x5410U;
|
||||
; COMMON-NEXT: add.s32 %r4, %r2, 1;
|
||||
; COMMON-NEXT: add.s32 %r5, %r1, 1;
|
||||
; COMMON-NEXT: st.v2.u32 [%rd1], {%r5, %r4};
|
||||
; COMMON-NEXT: st.v2.b32 [%rd1], {%r5, %r4};
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; COMMON-NEXT: ret;
|
||||
%r = trunc <2 x i32> %a to <2 x i16>
|
||||
@@ -893,7 +893,7 @@ define <2 x i16> @test_trunc_2xi64(<2 x i64> %a) #0 {
|
||||
; COMMON-NEXT: .reg .b64 %rd<3>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
|
||||
; COMMON-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
|
||||
; COMMON-NEXT: cvt.u16.u64 %rs1, %rd2;
|
||||
; COMMON-NEXT: cvt.u16.u64 %rs2, %rd1;
|
||||
; COMMON-NEXT: mov.b32 %r1, {%rs2, %rs1};
|
||||
@@ -910,7 +910,7 @@ define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<4>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_zext_2xi32_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_zext_2xi32_param_0];
|
||||
; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; COMMON-NEXT: cvt.u32.u16 %r2, %rs1;
|
||||
; COMMON-NEXT: cvt.u32.u16 %r3, %rs2;
|
||||
@@ -928,7 +928,7 @@ define <2 x i64> @test_zext_2xi64(<2 x i16> %a) #0 {
|
||||
; COMMON-NEXT: .reg .b64 %rd<3>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_zext_2xi64_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_zext_2xi64_param_0];
|
||||
; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; COMMON-NEXT: cvt.u64.u16 %rd1, %rs2;
|
||||
; COMMON-NEXT: cvt.u64.u16 %rd2, %rs1;
|
||||
@@ -944,7 +944,7 @@ define <2 x i16> @test_bitcast_i32_to_2xi16(i32 %a) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<2>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_bitcast_i32_to_2xi16_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_bitcast_i32_to_2xi16_param_0];
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; COMMON-NEXT: ret;
|
||||
%r = bitcast i32 %a to <2 x i16>
|
||||
@@ -957,7 +957,7 @@ define i32 @test_bitcast_2xi16_to_i32(<2 x i16> %a) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<2>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_bitcast_2xi16_to_i32_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_bitcast_2xi16_to_i32_param_0];
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; COMMON-NEXT: ret;
|
||||
%r = bitcast <2 x i16> %a to i32
|
||||
@@ -971,7 +971,7 @@ define <2 x half> @test_bitcast_2xi16_to_2xhalf(i16 %a) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<2>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u16 %rs1, [test_bitcast_2xi16_to_2xhalf_param_0];
|
||||
; COMMON-NEXT: ld.param.b16 %rs1, [test_bitcast_2xi16_to_2xhalf_param_0];
|
||||
; COMMON-NEXT: mov.b16 %rs2, 5;
|
||||
; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2};
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
@@ -990,7 +990,7 @@ define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<3>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_shufflevector_param_0];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0];
|
||||
; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; COMMON-NEXT: mov.b32 %r2, {%rs2, %rs1};
|
||||
; COMMON-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
@@ -1006,8 +1006,8 @@ define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 {
|
||||
; COMMON-NEXT: .reg .b32 %r<3>;
|
||||
; COMMON-EMPTY:
|
||||
; COMMON-NEXT: // %bb.0:
|
||||
; COMMON-NEXT: ld.param.u16 %rs1, [test_insertelement_param_1];
|
||||
; COMMON-NEXT: ld.param.u32 %r1, [test_insertelement_param_0];
|
||||
; COMMON-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1];
|
||||
; COMMON-NEXT: ld.param.b32 %r1, [test_insertelement_param_0];
|
||||
; I16x2-NEXT: mov.b32 {%rs2, _}, %r1;
|
||||
; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; }
|
||||
; COMMON-NEXT: mov.b32 %r2, {%rs2, %rs1};
|
||||
|
||||
@@ -5,7 +5,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
|
||||
|
||||
; CHECK: .visible .func (.param .b32 func_retval0) callee
|
||||
define i8 @callee(i8 %a) {
|
||||
; CHECK: ld.param.u8
|
||||
; CHECK: ld.param.b8
|
||||
%ret = add i8 %a, 42
|
||||
; CHECK: st.param.b32
|
||||
ret i8 %ret
|
||||
@@ -13,7 +13,7 @@ define i8 @callee(i8 %a) {
|
||||
|
||||
; CHECK: .visible .func caller
|
||||
define void @caller(ptr %a) {
|
||||
; CHECK: ld.u8
|
||||
; CHECK: ld.b8
|
||||
%val = load i8, ptr %a
|
||||
%ret = tail call i8 @callee(i8 %val)
|
||||
; CHECK: ld.param.b32
|
||||
|
||||
@@ -17,7 +17,7 @@ define i16 @test_bitcast_2xi8_i16(<2 x i8> %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_2xi8_i16_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_2xi8_i16_param_0];
|
||||
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
|
||||
; CHECK-NEXT: shl.b16 %rs3, %rs2, 8;
|
||||
; CHECK-NEXT: and.b16 %rs4, %rs1, 255;
|
||||
@@ -36,7 +36,7 @@ define <2 x i8> @test_bitcast_i16_2xi8(i16 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u16 %rs1, [test_bitcast_i16_2xi8_param_0];
|
||||
; CHECK-NEXT: ld.param.b16 %rs1, [test_bitcast_i16_2xi8_param_0];
|
||||
; CHECK-NEXT: shr.u16 %rs2, %rs1, 8;
|
||||
; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2};
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
|
||||
@@ -29,7 +29,7 @@ define i8 @test_extract_0(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_extract_0_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_extract_0_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -43,7 +43,7 @@ define i8 @test_extract_1(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_extract_1_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_extract_1_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r2, %r1, 8, 8;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -57,7 +57,7 @@ define i8 @test_extract_2(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_extract_2_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_extract_2_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r2, %r1, 16, 8;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -71,7 +71,7 @@ define i8 @test_extract_3(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_extract_3_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_extract_3_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -86,8 +86,8 @@ define i8 @test_extract_i(<4 x i8> %a, i64 %idx) #0 {
|
||||
; CHECK-NEXT: .reg .b64 %rd<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_extract_i_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_extract_i_param_0];
|
||||
; CHECK-NEXT: cvt.u32.u64 %r2, %rd1;
|
||||
; CHECK-NEXT: shl.b32 %r3, %r2, 3;
|
||||
; CHECK-NEXT: bfe.u32 %r4, %r1, %r3, 8;
|
||||
@@ -104,8 +104,8 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<18>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_add_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_add_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_add_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
|
||||
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
|
||||
; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8;
|
||||
@@ -146,7 +146,7 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<13>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_add_imm_0_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
|
||||
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
|
||||
; CHECK-NEXT: add.s16 %rs2, %rs1, 4;
|
||||
@@ -179,7 +179,7 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<13>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_add_imm_1_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
|
||||
; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
|
||||
; CHECK-NEXT: add.s16 %rs2, %rs1, 4;
|
||||
@@ -212,8 +212,8 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<18>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_sub_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_sub_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
|
||||
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
|
||||
; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8;
|
||||
@@ -254,8 +254,8 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<26>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_smax_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_smax_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_smax_param_0];
|
||||
; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
|
||||
; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
|
||||
; CHECK-NEXT: setp.gt.s32 %p1, %r4, %r3;
|
||||
@@ -297,8 +297,8 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<18>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_umax_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_umax_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_umax_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8;
|
||||
; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
|
||||
; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3;
|
||||
@@ -332,8 +332,8 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<26>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_smin_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_smin_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_smin_param_0];
|
||||
; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8;
|
||||
; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8;
|
||||
; CHECK-NEXT: setp.le.s32 %p1, %r4, %r3;
|
||||
@@ -375,8 +375,8 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<18>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_umin_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_umin_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_umin_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_umin_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8;
|
||||
; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
|
||||
; CHECK-NEXT: setp.ls.u32 %p1, %r4, %r3;
|
||||
@@ -410,9 +410,9 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<23>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_eq_param_2];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_eq_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_eq_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_eq_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_eq_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_eq_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8;
|
||||
; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8;
|
||||
; CHECK-NEXT: setp.eq.u32 %p1, %r5, %r4;
|
||||
@@ -450,9 +450,9 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<23>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_ne_param_2];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_ne_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_ne_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_ne_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_ne_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_ne_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8;
|
||||
; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8;
|
||||
; CHECK-NEXT: setp.ne.u32 %p1, %r5, %r4;
|
||||
@@ -490,8 +490,8 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<18>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_mul_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_mul_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_mul_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_mul_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
|
||||
; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
|
||||
; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8;
|
||||
@@ -531,8 +531,8 @@ define <4 x i8> @test_or(<4 x i8> %a, <4 x i8> %b) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_or_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_or_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_or_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_or_param_0];
|
||||
; CHECK-NEXT: or.b32 %r3, %r1, %r2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -547,7 +547,7 @@ define <4 x i8> @test_or_computed(i8 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<8>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u8 %rs1, [test_or_computed_param_0];
|
||||
; CHECK-NEXT: ld.param.b8 %rs1, [test_or_computed_param_0];
|
||||
; CHECK-NEXT: mov.b32 %r1, 0;
|
||||
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
|
||||
@@ -569,7 +569,7 @@ define <4 x i8> @test_or_imm_0(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_or_imm_0_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_or_imm_0_param_0];
|
||||
; CHECK-NEXT: or.b32 %r2, %r1, 67305985;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -583,7 +583,7 @@ define <4 x i8> @test_or_imm_1(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_or_imm_1_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_or_imm_1_param_0];
|
||||
; CHECK-NEXT: or.b32 %r2, %r1, 67305985;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -597,8 +597,8 @@ define <4 x i8> @test_xor(<4 x i8> %a, <4 x i8> %b) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_xor_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_xor_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_xor_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_xor_param_0];
|
||||
; CHECK-NEXT: xor.b32 %r3, %r1, %r2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -613,7 +613,7 @@ define <4 x i8> @test_xor_computed(i8 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<8>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u8 %rs1, [test_xor_computed_param_0];
|
||||
; CHECK-NEXT: ld.param.b8 %rs1, [test_xor_computed_param_0];
|
||||
; CHECK-NEXT: mov.b32 %r1, 0;
|
||||
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
|
||||
@@ -635,7 +635,7 @@ define <4 x i8> @test_xor_imm_0(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_xor_imm_0_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_xor_imm_0_param_0];
|
||||
; CHECK-NEXT: xor.b32 %r2, %r1, 67305985;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -649,7 +649,7 @@ define <4 x i8> @test_xor_imm_1(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_xor_imm_1_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_xor_imm_1_param_0];
|
||||
; CHECK-NEXT: xor.b32 %r2, %r1, 67305985;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -663,8 +663,8 @@ define <4 x i8> @test_and(<4 x i8> %a, <4 x i8> %b) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_and_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_and_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_and_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_and_param_0];
|
||||
; CHECK-NEXT: and.b32 %r3, %r1, %r2;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -679,7 +679,7 @@ define <4 x i8> @test_and_computed(i8 %a) {
|
||||
; CHECK-NEXT: .reg .b32 %r<8>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u8 %rs1, [test_and_computed_param_0];
|
||||
; CHECK-NEXT: ld.param.b8 %rs1, [test_and_computed_param_0];
|
||||
; CHECK-NEXT: mov.b32 %r1, 0;
|
||||
; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
|
||||
@@ -701,7 +701,7 @@ define <4 x i8> @test_and_imm_0(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_and_imm_0_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_and_imm_0_param_0];
|
||||
; CHECK-NEXT: and.b32 %r2, %r1, 67305985;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -715,7 +715,7 @@ define <4 x i8> @test_and_imm_1(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_and_imm_1_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_and_imm_1_param_0];
|
||||
; CHECK-NEXT: and.b32 %r2, %r1, 67305985;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -730,10 +730,10 @@ define void @test_ldst_v2i8(ptr %a, ptr %b) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2i8_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2i8_param_0];
|
||||
; CHECK-NEXT: ld.u32 %r1, [%rd1];
|
||||
; CHECK-NEXT: st.u32 [%rd2], %r1;
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v2i8_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v2i8_param_0];
|
||||
; CHECK-NEXT: ld.b32 %r1, [%rd1];
|
||||
; CHECK-NEXT: st.b32 [%rd2], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%t1 = load <4 x i8>, ptr %a
|
||||
store <4 x i8> %t1, ptr %b, align 16
|
||||
@@ -747,12 +747,12 @@ define void @test_ldst_v3i8(ptr %a, ptr %b) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v3i8_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3i8_param_0];
|
||||
; CHECK-NEXT: ld.u32 %r1, [%rd1];
|
||||
; CHECK-NEXT: st.u16 [%rd2], %r1;
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v3i8_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v3i8_param_0];
|
||||
; CHECK-NEXT: ld.b32 %r1, [%rd1];
|
||||
; CHECK-NEXT: st.b16 [%rd2], %r1;
|
||||
; CHECK-NEXT: bfe.u32 %r2, %r1, 16, 8;
|
||||
; CHECK-NEXT: st.u8 [%rd2+2], %r2;
|
||||
; CHECK-NEXT: st.b8 [%rd2+2], %r2;
|
||||
; CHECK-NEXT: ret;
|
||||
%t1 = load <3 x i8>, ptr %a
|
||||
store <3 x i8> %t1, ptr %b, align 16
|
||||
@@ -766,10 +766,10 @@ define void @test_ldst_v4i8(ptr %a, ptr %b) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4i8_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4i8_param_0];
|
||||
; CHECK-NEXT: ld.u32 %r1, [%rd1];
|
||||
; CHECK-NEXT: st.u32 [%rd2], %r1;
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_param_0];
|
||||
; CHECK-NEXT: ld.b32 %r1, [%rd1];
|
||||
; CHECK-NEXT: st.b32 [%rd2], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%t1 = load <4 x i8>, ptr %a
|
||||
store <4 x i8> %t1, ptr %b, align 16
|
||||
@@ -783,16 +783,16 @@ define void @test_ldst_v4i8_unaligned(ptr %a, ptr %b) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4i8_unaligned_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4i8_unaligned_param_0];
|
||||
; CHECK-NEXT: ld.u8 %r1, [%rd1];
|
||||
; CHECK-NEXT: ld.u8 %r2, [%rd1+1];
|
||||
; CHECK-NEXT: ld.u8 %r3, [%rd1+2];
|
||||
; CHECK-NEXT: ld.u8 %r4, [%rd1+3];
|
||||
; CHECK-NEXT: st.u8 [%rd2+3], %r4;
|
||||
; CHECK-NEXT: st.u8 [%rd2+2], %r3;
|
||||
; CHECK-NEXT: st.u8 [%rd2+1], %r2;
|
||||
; CHECK-NEXT: st.u8 [%rd2], %r1;
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_unaligned_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_unaligned_param_0];
|
||||
; CHECK-NEXT: ld.b8 %r1, [%rd1];
|
||||
; CHECK-NEXT: ld.b8 %r2, [%rd1+1];
|
||||
; CHECK-NEXT: ld.b8 %r3, [%rd1+2];
|
||||
; CHECK-NEXT: ld.b8 %r4, [%rd1+3];
|
||||
; CHECK-NEXT: st.b8 [%rd2+3], %r4;
|
||||
; CHECK-NEXT: st.b8 [%rd2+2], %r3;
|
||||
; CHECK-NEXT: st.b8 [%rd2+1], %r2;
|
||||
; CHECK-NEXT: st.b8 [%rd2], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%t1 = load <4 x i8>, ptr %a, align 1
|
||||
store <4 x i8> %t1, ptr %b, align 1
|
||||
@@ -807,8 +807,8 @@ define void @test_ldst_v8i8(ptr %a, ptr %b) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<3>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8i8_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8i8_param_0];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v8i8_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v8i8_param_0];
|
||||
; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1];
|
||||
; CHECK-NEXT: st.v2.b32 [%rd2], {%r1, %r2};
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -825,8 +825,8 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_call_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_call_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_call_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_call_param_0];
|
||||
; CHECK-NEXT: { // callseq 0, 0
|
||||
; CHECK-NEXT: .param .align 4 .b8 param0[4];
|
||||
; CHECK-NEXT: st.param.b32 [param0], %r1;
|
||||
@@ -853,8 +853,8 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_call_flipped_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_call_flipped_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0];
|
||||
; CHECK-NEXT: { // callseq 1, 0
|
||||
; CHECK-NEXT: .param .align 4 .b8 param0[4];
|
||||
; CHECK-NEXT: st.param.b32 [param0], %r2;
|
||||
@@ -881,8 +881,8 @@ define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_tailcall_flipped_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_tailcall_flipped_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0];
|
||||
; CHECK-NEXT: { // callseq 2, 0
|
||||
; CHECK-NEXT: .param .align 4 .b8 param0[4];
|
||||
; CHECK-NEXT: st.param.b32 [param0], %r2;
|
||||
@@ -911,11 +911,11 @@ define <4 x i8> @test_select(<4 x i8> %a, <4 x i8> %b, i1 zeroext %c) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u8 %rs1, [test_select_param_2];
|
||||
; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2];
|
||||
; CHECK-NEXT: and.b16 %rs2, %rs1, 1;
|
||||
; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0;
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_select_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_select_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_select_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_select_param_0];
|
||||
; CHECK-NEXT: selp.b32 %r3, %r1, %r2, %p1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -930,10 +930,10 @@ define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8>
|
||||
; CHECK-NEXT: .reg .b32 %r<28>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r4, [test_select_cc_param_3];
|
||||
; CHECK-NEXT: ld.param.u32 %r3, [test_select_cc_param_2];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r4, [test_select_cc_param_3];
|
||||
; CHECK-NEXT: ld.param.b32 %r3, [test_select_cc_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r5, %r4, 0, 8;
|
||||
; CHECK-NEXT: bfe.u32 %r6, %r3, 0, 8;
|
||||
; CHECK-NEXT: setp.ne.u32 %p1, %r6, %r5;
|
||||
@@ -975,10 +975,10 @@ define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b,
|
||||
; CHECK-NEXT: .reg .b32 %r<23>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [test_select_cc_i32_i8_param_1];
|
||||
; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0];
|
||||
; CHECK-NEXT: ld.param.u32 %r10, [test_select_cc_i32_i8_param_3];
|
||||
; CHECK-NEXT: ld.param.u32 %r9, [test_select_cc_i32_i8_param_2];
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_select_cc_i32_i8_param_1];
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r10, [test_select_cc_i32_i8_param_3];
|
||||
; CHECK-NEXT: ld.param.b32 %r9, [test_select_cc_i32_i8_param_2];
|
||||
; CHECK-NEXT: bfe.u32 %r11, %r10, 0, 8;
|
||||
; CHECK-NEXT: bfe.u32 %r12, %r9, 0, 8;
|
||||
; CHECK-NEXT: setp.ne.u32 %p1, %r12, %r11;
|
||||
@@ -1010,10 +1010,10 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b,
|
||||
; CHECK-NEXT: .reg .b32 %r<26>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3];
|
||||
; CHECK-NEXT: ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2];
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_i8_i32_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_i8_i32_param_0];
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3];
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_i8_i32_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_i8_i32_param_0];
|
||||
; CHECK-NEXT: setp.ne.s32 %p1, %r3, %r7;
|
||||
; CHECK-NEXT: setp.ne.s32 %p2, %r4, %r8;
|
||||
; CHECK-NEXT: setp.ne.s32 %p3, %r5, %r9;
|
||||
@@ -1048,7 +1048,7 @@ define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<8>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0];
|
||||
; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0];
|
||||
; CHECK-NEXT: prmt.b32 %r5, %r3, %r4, 0x3340U;
|
||||
; CHECK-NEXT: prmt.b32 %r6, %r1, %r2, 0x3340U;
|
||||
; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 0x5410U;
|
||||
@@ -1065,8 +1065,8 @@ define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b64 %rd<5>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16];
|
||||
; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16];
|
||||
; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_trunc_2xi64_param_0];
|
||||
; CHECK-NEXT: cvt.u32.u64 %r1, %rd4;
|
||||
; CHECK-NEXT: cvt.u32.u64 %r2, %rd3;
|
||||
; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U;
|
||||
@@ -1086,7 +1086,7 @@ define <4 x i32> @test_zext_2xi32(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_zext_2xi32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_zext_2xi32_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
|
||||
; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8;
|
||||
; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8;
|
||||
@@ -1104,7 +1104,7 @@ define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b64 %rd<9>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_zext_2xi64_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_zext_2xi64_param_0];
|
||||
; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
|
||||
; CHECK-NEXT: cvt.u64.u32 %rd1, %r2;
|
||||
; CHECK-NEXT: and.b64 %rd2, %rd1, 255;
|
||||
@@ -1130,7 +1130,7 @@ define <4 x i8> @test_bitcast_i32_to_4xi8(i32 %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_i32_to_4xi8_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_i32_to_4xi8_param_0];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%r = bitcast i32 %a to <4 x i8>
|
||||
@@ -1144,7 +1144,7 @@ define <4 x i8> @test_bitcast_float_to_4xi8(float %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.f32 %f1, [test_bitcast_float_to_4xi8_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %f1, [test_bitcast_float_to_4xi8_param_0];
|
||||
; CHECK-NEXT: mov.b32 %r1, %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -1158,7 +1158,7 @@ define i32 @test_bitcast_4xi8_to_i32(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_4xi8_to_i32_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_4xi8_to_i32_param_0];
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r1;
|
||||
; CHECK-NEXT: ret;
|
||||
%r = bitcast <4 x i8> %a to i32
|
||||
@@ -1172,9 +1172,9 @@ define float @test_bitcast_4xi8_to_float(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %f<2>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_4xi8_to_float_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_4xi8_to_float_param_0];
|
||||
; CHECK-NEXT: mov.b32 %f1, %r1;
|
||||
; CHECK-NEXT: st.param.f32 [func_retval0], %f1;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %f1;
|
||||
; CHECK-NEXT: ret;
|
||||
%r = bitcast <4 x i8> %a to float
|
||||
ret float %r
|
||||
@@ -1188,7 +1188,7 @@ define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<6>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0];
|
||||
; CHECK-NEXT: ld.param.b8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0];
|
||||
; CHECK-NEXT: mov.b32 %r1, 6;
|
||||
; CHECK-NEXT: prmt.b32 %r2, %r1, 7, 0x3340U;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r3, %rs1;
|
||||
@@ -1211,7 +1211,7 @@ define <4 x i8> @test_shufflevector(<4 x i8> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_shufflevector_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0];
|
||||
; CHECK-NEXT: // implicit-def: %r3
|
||||
; CHECK-NEXT: prmt.b32 %r2, %r1, %r3, 0x123U;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
|
||||
@@ -1226,8 +1226,8 @@ define <4 x i8> @test_shufflevector_2(<4 x i8> %a, <4 x i8> %b) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u32 %r2, [test_shufflevector_2_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_shufflevector_2_param_0];
|
||||
; CHECK-NEXT: ld.param.b32 %r2, [test_shufflevector_2_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_shufflevector_2_param_0];
|
||||
; CHECK-NEXT: prmt.b32 %r3, %r1, %r2, 0x2537U;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
; CHECK-NEXT: ret;
|
||||
@@ -1243,8 +1243,8 @@ define <4 x i8> @test_insertelement(<4 x i8> %a, i8 %x) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.u8 %rs1, [test_insertelement_param_1];
|
||||
; CHECK-NEXT: ld.param.u32 %r1, [test_insertelement_param_0];
|
||||
; CHECK-NEXT: ld.param.b8 %rs1, [test_insertelement_param_1];
|
||||
; CHECK-NEXT: ld.param.b32 %r1, [test_insertelement_param_0];
|
||||
; CHECK-NEXT: cvt.u32.u16 %r2, %rs1;
|
||||
; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8;
|
||||
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
|
||||
@@ -1260,7 +1260,7 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<12>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_fptosi_4xhalf_to_4xi8_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_4xhalf_to_4xi8_param_0];
|
||||
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; CHECK-NEXT: cvt.rzi.s16.f16 %rs3, %rs2;
|
||||
; CHECK-NEXT: cvt.rzi.s16.f16 %rs4, %rs1;
|
||||
@@ -1291,7 +1291,7 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 {
|
||||
; CHECK-NEXT: .reg .b32 %r<12>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0:
|
||||
; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_fptoui_4xhalf_to_4xi8_param_0];
|
||||
; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_4xhalf_to_4xi8_param_0];
|
||||
; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2;
|
||||
; CHECK-NEXT: cvt.rzi.u16.f16 %rs3, %rs2;
|
||||
; CHECK-NEXT: cvt.rzi.u16.f16 %rs4, %rs1;
|
||||
@@ -1323,11 +1323,11 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.u64 %rd3, [test_srem_v4i8_param_2];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [test_srem_v4i8_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_srem_v4i8_param_0];
|
||||
; CHECK-NEXT: ld.u32 %r1, [%rd1];
|
||||
; CHECK-NEXT: ld.u32 %r2, [%rd2];
|
||||
; CHECK-NEXT: ld.param.b64 %rd3, [test_srem_v4i8_param_2];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_srem_v4i8_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_srem_v4i8_param_0];
|
||||
; CHECK-NEXT: ld.b32 %r1, [%rd1];
|
||||
; CHECK-NEXT: ld.b32 %r2, [%rd2];
|
||||
; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8;
|
||||
; CHECK-NEXT: cvt.s8.s32 %rs1, %r3;
|
||||
; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8;
|
||||
@@ -1355,7 +1355,7 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) {
|
||||
; CHECK-NEXT: cvt.u32.u16 %r15, %rs12;
|
||||
; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U;
|
||||
; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U;
|
||||
; CHECK-NEXT: st.u32 [%rd3], %r17;
|
||||
; CHECK-NEXT: st.b32 [%rd3], %r17;
|
||||
; CHECK-NEXT: ret;
|
||||
entry:
|
||||
%t57 = load <4 x i8>, ptr %a, align 4
|
||||
@@ -1379,17 +1379,17 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.u64 %rd3, [test_srem_v3i8_param_2];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [test_srem_v3i8_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_srem_v3i8_param_0];
|
||||
; CHECK-NEXT: ld.u8 %rs1, [%rd1];
|
||||
; CHECK-NEXT: ld.u8 %rs2, [%rd1+1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd3, [test_srem_v3i8_param_2];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_srem_v3i8_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_srem_v3i8_param_0];
|
||||
; CHECK-NEXT: ld.b8 %rs1, [%rd1];
|
||||
; CHECK-NEXT: ld.b8 %rs2, [%rd1+1];
|
||||
; CHECK-NEXT: shl.b16 %rs3, %rs2, 8;
|
||||
; CHECK-NEXT: or.b16 %rs4, %rs3, %rs1;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r1, %rs4;
|
||||
; CHECK-NEXT: ld.s8 %rs5, [%rd1+2];
|
||||
; CHECK-NEXT: ld.u8 %rs6, [%rd2];
|
||||
; CHECK-NEXT: ld.u8 %rs7, [%rd2+1];
|
||||
; CHECK-NEXT: ld.b8 %rs6, [%rd2];
|
||||
; CHECK-NEXT: ld.b8 %rs7, [%rd2+1];
|
||||
; CHECK-NEXT: shl.b16 %rs8, %rs7, 8;
|
||||
; CHECK-NEXT: or.b16 %rs9, %rs8, %rs6;
|
||||
; CHECK-NEXT: cvt.u32.u16 %r2, %rs9;
|
||||
@@ -1413,10 +1413,10 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) {
|
||||
; CHECK-NEXT: prmt.b32 %r13, %r9, %r10, 0x5410U;
|
||||
; CHECK-NEXT: rem.s16 %rs17, %rs5, %rs10;
|
||||
; CHECK-NEXT: mov.b32 {%rs18, _}, %r13;
|
||||
; CHECK-NEXT: st.u8 [%rd3], %rs18;
|
||||
; CHECK-NEXT: st.b8 [%rd3], %rs18;
|
||||
; CHECK-NEXT: shr.u16 %rs19, %rs18, 8;
|
||||
; CHECK-NEXT: st.u8 [%rd3+1], %rs19;
|
||||
; CHECK-NEXT: st.u8 [%rd3+2], %rs17;
|
||||
; CHECK-NEXT: st.b8 [%rd3+1], %rs19;
|
||||
; CHECK-NEXT: st.b8 [%rd3+2], %rs17;
|
||||
; CHECK-NEXT: ret;
|
||||
entry:
|
||||
%t57 = load <3 x i8>, ptr %a, align 1
|
||||
@@ -1434,11 +1434,11 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) {
|
||||
; CHECK-NEXT: .reg .b64 %rd<4>;
|
||||
; CHECK-EMPTY:
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: ld.param.u64 %rd3, [test_sext_v4i1_to_v4i8_param_2];
|
||||
; CHECK-NEXT: ld.param.u64 %rd2, [test_sext_v4i1_to_v4i8_param_1];
|
||||
; CHECK-NEXT: ld.param.u64 %rd1, [test_sext_v4i1_to_v4i8_param_0];
|
||||
; CHECK-NEXT: ld.u32 %r1, [%rd1];
|
||||
; CHECK-NEXT: ld.u32 %r2, [%rd2];
|
||||
; CHECK-NEXT: ld.param.b64 %rd3, [test_sext_v4i1_to_v4i8_param_2];
|
||||
; CHECK-NEXT: ld.param.b64 %rd2, [test_sext_v4i1_to_v4i8_param_1];
|
||||
; CHECK-NEXT: ld.param.b64 %rd1, [test_sext_v4i1_to_v4i8_param_0];
|
||||
; CHECK-NEXT: ld.b32 %r1, [%rd1];
|
||||
; CHECK-NEXT: ld.b32 %r2, [%rd2];
|
||||
; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8;
|
||||
; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8;
|
||||
; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3;
|
||||
@@ -1458,7 +1458,7 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) {
|
||||
; CHECK-NEXT: selp.b32 %r15, -1, 0, %p1;
|
||||
; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U;
|
||||
; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U;
|
||||
; CHECK-NEXT: st.u32 [%rd3], %r17;
|
||||
; CHECK-NEXT: st.b32 [%rd3], %r17;
|
||||
; CHECK-NEXT: ret;
|
||||
entry:
|
||||
%t1 = load <4 x i8>, ptr %a, align 4
|
||||
|
||||
@@ -40,7 +40,7 @@ define %struct.S16 @i32_to_2xi16(i32 noundef %in) {
|
||||
%low = trunc i32 %in to i16
|
||||
%high32 = lshr i32 %in, 16
|
||||
%high = trunc i32 %high32 to i16
|
||||
; CHECK: ld.param.u32 %[[R32:r[0-9]+]], [i32_to_2xi16_param_0];
|
||||
; CHECK: ld.param.b32 %[[R32:r[0-9]+]], [i32_to_2xi16_param_0];
|
||||
; CHECK-DAG: cvt.u16.u32 %rs{{[0-9+]}}, %[[R32]];
|
||||
; CHECK-DAG mov.b32 {tmp, %rs{{[0-9+]}}}, %[[R32]];
|
||||
%s1 = insertvalue %struct.S16 poison, i16 %low, 0
|
||||
@@ -54,7 +54,7 @@ define %struct.S16 @i32_to_2xi16_lh(i32 noundef %in) {
|
||||
%high32 = lshr i32 %in, 16
|
||||
%high = trunc i32 %high32 to i16
|
||||
%low = trunc i32 %in to i16
|
||||
; CHECK: ld.param.u32 %[[R32:r[0-9]+]], [i32_to_2xi16_lh_param_0];
|
||||
; CHECK: ld.param.b32 %[[R32:r[0-9]+]], [i32_to_2xi16_lh_param_0];
|
||||
; CHECK-DAG: cvt.u16.u32 %rs{{[0-9+]}}, %[[R32]];
|
||||
; CHECK-DAG mov.b32 {tmp, %rs{{[0-9+]}}}, %[[R32]];
|
||||
%s1 = insertvalue %struct.S16 poison, i16 %low, 0
|
||||
@@ -82,7 +82,7 @@ define %struct.S32 @i64_to_2xi32(i64 noundef %in) {
|
||||
%low = trunc i64 %in to i32
|
||||
%high64 = lshr i64 %in, 32
|
||||
%high = trunc i64 %high64 to i32
|
||||
; CHECK: ld.param.u64 %[[R64:rd[0-9]+]], [i64_to_2xi32_param_0];
|
||||
; CHECK: ld.param.b64 %[[R64:rd[0-9]+]], [i64_to_2xi32_param_0];
|
||||
; CHECK-DAG: cvt.u32.u64 %r{{[0-9+]}}, %[[R64]];
|
||||
; CHECK-DAG mov.b64 {tmp, %r{{[0-9+]}}}, %[[R64]];
|
||||
%s1 = insertvalue %struct.S32 poison, i32 %low, 0
|
||||
@@ -112,7 +112,7 @@ define %struct.S16 @i32_to_2xi16_shr(i32 noundef %i){
|
||||
%l = trunc i32 %i1 to i16
|
||||
%h32 = ashr i32 %i1, 16
|
||||
%h = trunc i32 %h32 to i16
|
||||
; CHECK: ld.param.u32 %[[R32:r[0-9]+]], [i32_to_2xi16_shr_param_0];
|
||||
; CHECK: ld.param.b32 %[[R32:r[0-9]+]], [i32_to_2xi16_shr_param_0];
|
||||
; CHECK: shr.s32 %[[R32H:r[0-9]+]], %[[R32]], 16;
|
||||
; CHECK-DAG mov.b32 {tmp, %rs{{[0-9+]}}}, %[[R32]];
|
||||
; CHECK-DAG mov.b32 {tmp, %rs{{[0-9+]}}}, %[[R32H]];
|
||||
|
||||
@@ -22,9 +22,9 @@ define internal i32 @foo() {
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: mov.b64 %SPL, __local_depot0;
|
||||
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
|
||||
; CHECK-NEXT: ld.global.u64 %rd1, [ptr];
|
||||
; CHECK-NEXT: ld.global.b64 %rd1, [ptr];
|
||||
; CHECK-NEXT: add.u64 %rd3, %SPL, 1;
|
||||
; CHECK-NEXT: ld.local.u8 %rs1, [%rd3];
|
||||
; CHECK-NEXT: ld.local.b8 %rs1, [%rd3];
|
||||
; CHECK-NEXT: add.u64 %rd4, %SP, 0;
|
||||
; CHECK-NEXT: { // callseq 0, 0
|
||||
; CHECK-NEXT: .param .align 1 .b8 param0[1];
|
||||
@@ -65,9 +65,9 @@ define internal i32 @bar() {
|
||||
; CHECK-NEXT: // %bb.0: // %entry
|
||||
; CHECK-NEXT: mov.b64 %SPL, __local_depot1;
|
||||
; CHECK-NEXT: cvta.local.u64 %SP, %SPL;
|
||||
; CHECK-NEXT: ld.global.u64 %rd1, [ptr];
|
||||
; CHECK-NEXT: ld.global.b64 %rd1, [ptr];
|
||||
; CHECK-NEXT: add.u64 %rd3, %SPL, 8;
|
||||
; CHECK-NEXT: ld.local.u64 %rd4, [%rd3];
|
||||
; CHECK-NEXT: ld.local.b64 %rd4, [%rd3];
|
||||
; CHECK-NEXT: add.u64 %rd5, %SP, 0;
|
||||
; CHECK-NEXT: { // callseq 1, 0
|
||||
; CHECK-NEXT: .param .align 8 .b8 param0[8];
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user