From 369891b6747e4ad4b5e4e6d06f3f7596f3ee3f02 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Sat, 10 May 2025 08:26:26 -0700 Subject: [PATCH] [NVPTX] use untyped loads and stores wherever possible (#137698) In most cases, the type information attached to load and store instructions is meaningless and inconsistently applied. We can usually use ".b" loads and avoid the complexity of trying to assign the correct type. The one exception is sign-extending loads, which will continue to use ".s" to ensure the sign extension into a larger register is done correctly. --- clang/test/CodeGenCUDA/bf16.cu | 2 +- clang/test/CodeGenCUDA/fp-contract.cu | 28 +- clang/test/CodeGenCUDA/memcpy-libcall.cu | 28 +- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 67 +- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 30 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 64 +- .../test/CodeGen/NVPTX/LoadStoreVectorizer.ll | 174 +- llvm/test/CodeGen/NVPTX/MachineSink-call.ll | 2 +- .../CodeGen/NVPTX/MachineSink-convergent.ll | 2 +- llvm/test/CodeGen/NVPTX/access-non-generic.ll | 24 +- llvm/test/CodeGen/NVPTX/addr-mode.ll | 20 +- .../CodeGen/NVPTX/addrspacecast-folding.ll | 4 +- .../test/CodeGen/NVPTX/addrspacecast-ptx64.ll | 32 +- llvm/test/CodeGen/NVPTX/addrspacecast.ll | 38 +- llvm/test/CodeGen/NVPTX/aggregate-return.ll | 30 +- llvm/test/CodeGen/NVPTX/and-or-setcc.ll | 8 +- llvm/test/CodeGen/NVPTX/anonymous-fn-param.ll | 4 +- llvm/test/CodeGen/NVPTX/applypriority.ll | 4 +- llvm/test/CodeGen/NVPTX/atomics-sm70.ll | 26 +- llvm/test/CodeGen/NVPTX/atomics-sm90.ll | 26 +- llvm/test/CodeGen/NVPTX/atomics.ll | 132 +- llvm/test/CodeGen/NVPTX/barrier.ll | 4 +- llvm/test/CodeGen/NVPTX/bf16-instructions.ll | 114 +- llvm/test/CodeGen/NVPTX/bf16.ll | 4 +- .../test/CodeGen/NVPTX/bf16x2-instructions.ll | 40 +- llvm/test/CodeGen/NVPTX/bfe.ll | 28 +- llvm/test/CodeGen/NVPTX/bmsk.ll | 12 +- llvm/test/CodeGen/NVPTX/bswap.ll | 8 +- llvm/test/CodeGen/NVPTX/bug21465.ll | 6 +- llvm/test/CodeGen/NVPTX/bug22246.ll | 10 
+- llvm/test/CodeGen/NVPTX/bug26185-2.ll | 2 +- llvm/test/CodeGen/NVPTX/bug26185.ll | 8 +- llvm/test/CodeGen/NVPTX/byval-const-global.ll | 4 +- .../CodeGen/NVPTX/call-with-alloca-buffer.ll | 6 +- llvm/test/CodeGen/NVPTX/chain-different-as.ll | 4 +- llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll | 1260 +++++------ llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll | 1260 +++++------ llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll | 1260 +++++------ llvm/test/CodeGen/NVPTX/cmpxchg.ll | 420 ++-- llvm/test/CodeGen/NVPTX/combine-mad.ll | 56 +- llvm/test/CodeGen/NVPTX/convert-fp-i8.ll | 20 +- llvm/test/CodeGen/NVPTX/convert-int-sm20.ll | 12 +- llvm/test/CodeGen/NVPTX/convert-sm100.ll | 8 +- llvm/test/CodeGen/NVPTX/convert-sm100a.ll | 42 +- llvm/test/CodeGen/NVPTX/convert-sm80.ll | 50 +- llvm/test/CodeGen/NVPTX/convert-sm90.ll | 8 +- llvm/test/CodeGen/NVPTX/copysign.ll | 36 +- .../CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll | 292 +-- .../NVPTX/cp-async-bulk-tensor-prefetch.ll | 62 +- .../NVPTX/cp-async-bulk-tensor-reduce.ll | 102 +- .../CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll | 120 +- llvm/test/CodeGen/NVPTX/cp-async-bulk.ll | 62 +- llvm/test/CodeGen/NVPTX/ctlz.ll | 28 +- llvm/test/CodeGen/NVPTX/dag-cse.ll | 8 +- llvm/test/CodeGen/NVPTX/demote-vars.ll | 4 +- llvm/test/CodeGen/NVPTX/discard.ll | 4 +- llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll | 2 +- .../NVPTX/distributed-shared-cluster.ll | 26 +- llvm/test/CodeGen/NVPTX/div.ll | 6 +- llvm/test/CodeGen/NVPTX/dot-product.ll | 74 +- .../NVPTX/dynamic-stackalloc-regression.ll | 10 +- llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll | 4 +- llvm/test/CodeGen/NVPTX/elect.ll | 4 +- llvm/test/CodeGen/NVPTX/extloadv.ll | 2 +- llvm/test/CodeGen/NVPTX/extractelement.ll | 14 +- llvm/test/CodeGen/NVPTX/f16-instructions.ll | 64 +- llvm/test/CodeGen/NVPTX/f16x2-instructions.ll | 90 +- llvm/test/CodeGen/NVPTX/f32-ex2.ll | 8 +- llvm/test/CodeGen/NVPTX/f32-lg2.ll | 8 +- llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll | 12 +- llvm/test/CodeGen/NVPTX/fexp2.ll | 40 +- 
llvm/test/CodeGen/NVPTX/flo.ll | 16 +- llvm/test/CodeGen/NVPTX/flog2.ll | 16 +- llvm/test/CodeGen/NVPTX/fma-relu-contract.ll | 24 +- .../CodeGen/NVPTX/fma-relu-fma-intrinsic.ll | 18 +- .../NVPTX/fma-relu-instruction-flag.ll | 36 +- llvm/test/CodeGen/NVPTX/fns.ll | 6 +- llvm/test/CodeGen/NVPTX/forward-ld-param.ll | 18 +- llvm/test/CodeGen/NVPTX/fp-contract.ll | 42 +- llvm/test/CodeGen/NVPTX/fp128-storage-type.ll | 12 +- llvm/test/CodeGen/NVPTX/frem.ll | 64 +- llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll | 20 +- llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll | 6 +- llvm/test/CodeGen/NVPTX/globals_lowering.ll | 4 +- llvm/test/CodeGen/NVPTX/half.ll | 4 +- llvm/test/CodeGen/NVPTX/i1-ext-load.ll | 8 +- llvm/test/CodeGen/NVPTX/i1-icmp.ll | 40 +- llvm/test/CodeGen/NVPTX/i1-load-lower.ll | 4 +- llvm/test/CodeGen/NVPTX/i1-select.ll | 40 +- llvm/test/CodeGen/NVPTX/i128-array.ll | 12 +- llvm/test/CodeGen/NVPTX/i128-ld-st.ll | 8 +- llvm/test/CodeGen/NVPTX/i128-param.ll | 12 +- llvm/test/CodeGen/NVPTX/i128-retval.ll | 8 +- llvm/test/CodeGen/NVPTX/i128.ll | 28 +- llvm/test/CodeGen/NVPTX/i16x2-instructions.ll | 204 +- llvm/test/CodeGen/NVPTX/i8-param.ll | 4 +- llvm/test/CodeGen/NVPTX/i8x2-instructions.ll | 4 +- llvm/test/CodeGen/NVPTX/i8x4-instructions.ll | 258 +-- llvm/test/CodeGen/NVPTX/idioms.ll | 8 +- llvm/test/CodeGen/NVPTX/indirect_byval.ll | 8 +- .../CodeGen/NVPTX/inline-asm-b128-test1.ll | 16 +- .../CodeGen/NVPTX/inline-asm-b128-test2.ll | 8 +- .../CodeGen/NVPTX/inline-asm-b128-test3.ll | 4 +- llvm/test/CodeGen/NVPTX/intrinsics.ll | 40 +- llvm/test/CodeGen/NVPTX/jump-table.ll | 12 +- llvm/test/CodeGen/NVPTX/ld-addrspace.ll | 72 +- llvm/test/CodeGen/NVPTX/ld-generic.ll | 24 +- llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py | 17 +- llvm/test/CodeGen/NVPTX/ldg-invariant.ll | 40 +- llvm/test/CodeGen/NVPTX/ldparam-v4.ll | 6 +- llvm/test/CodeGen/NVPTX/ldu-i8.ll | 2 +- llvm/test/CodeGen/NVPTX/ldu-ldg.ll | 84 +- .../test/CodeGen/NVPTX/ldu-reg-plus-offset.ll | 4 +- 
llvm/test/CodeGen/NVPTX/load-sext-i1.ll | 4 +- llvm/test/CodeGen/NVPTX/load-store-scalars.ll | 1152 +++++----- llvm/test/CodeGen/NVPTX/load-store-sm-70.ll | 1920 ++++++++--------- llvm/test/CodeGen/NVPTX/load-store-sm-90.ll | 768 +++---- llvm/test/CodeGen/NVPTX/load-store-vectors.ll | 528 ++--- .../NVPTX/load-with-non-coherent-cache.ll | 88 +- llvm/test/CodeGen/NVPTX/local-stack-frame.ll | 32 +- llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll | 22 +- llvm/test/CodeGen/NVPTX/lower-alloca.ll | 4 +- .../CodeGen/NVPTX/lower-args-gridconstant.ll | 80 +- llvm/test/CodeGen/NVPTX/lower-args.ll | 58 +- llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 308 +-- .../CodeGen/NVPTX/lower-kernel-ptr-arg.ll | 24 +- llvm/test/CodeGen/NVPTX/machine-sink.ll | 4 +- llvm/test/CodeGen/NVPTX/match.ll | 16 +- llvm/test/CodeGen/NVPTX/math-intrins.ll | 304 +-- .../CodeGen/NVPTX/misaligned-vector-ldst.ll | 80 +- llvm/test/CodeGen/NVPTX/misched_func_call.ll | 12 +- llvm/test/CodeGen/NVPTX/mulhi-intrins.ll | 24 +- llvm/test/CodeGen/NVPTX/nounroll.ll | 16 +- .../CodeGen/NVPTX/nvvm-reflect-arch-O0.ll | 6 +- llvm/test/CodeGen/NVPTX/param-add.ll | 8 +- llvm/test/CodeGen/NVPTX/param-align.ll | 30 +- llvm/test/CodeGen/NVPTX/param-load-store.ll | 236 +- llvm/test/CodeGen/NVPTX/param-overalign.ll | 28 +- .../CodeGen/NVPTX/param-vectorize-device.ll | 50 +- .../CodeGen/NVPTX/param-vectorize-kernel.ll | 208 +- llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll | 8 +- llvm/test/CodeGen/NVPTX/pr16278.ll | 2 +- llvm/test/CodeGen/NVPTX/prefetch.ll | 8 +- .../CodeGen/NVPTX/proxy-reg-erasure-ptx.ll | 18 +- llvm/test/CodeGen/NVPTX/rcp-opt.ll | 12 +- .../NVPTX/read-global-variable-constant.ll | 6 +- .../CodeGen/NVPTX/reduction-intrinsics.ll | 356 +-- llvm/test/CodeGen/NVPTX/redux-sync-f32.ll | 48 +- llvm/test/CodeGen/NVPTX/reg-types.ll | 20 +- llvm/test/CodeGen/NVPTX/rotate-add.ll | 40 +- llvm/test/CodeGen/NVPTX/rotate.ll | 164 +- llvm/test/CodeGen/NVPTX/rotate_64.ll | 4 +- llvm/test/CodeGen/NVPTX/sad-intrins.ll | 36 +- 
llvm/test/CodeGen/NVPTX/sched1.ll | 8 +- llvm/test/CodeGen/NVPTX/sched2.ll | 8 +- llvm/test/CodeGen/NVPTX/sext-params.ll | 2 +- llvm/test/CodeGen/NVPTX/sext-setcc.ll | 8 +- llvm/test/CodeGen/NVPTX/shfl-p.ll | 64 +- llvm/test/CodeGen/NVPTX/shfl-sync-p.ll | 80 +- llvm/test/CodeGen/NVPTX/shfl-sync.ll | 40 +- llvm/test/CodeGen/NVPTX/shfl.ll | 18 +- llvm/test/CodeGen/NVPTX/short-ptr.ll | 6 +- .../CodeGen/NVPTX/shuffle-vec-undef-init.ll | 6 +- llvm/test/CodeGen/NVPTX/st-addrspace.ll | 72 +- llvm/test/CodeGen/NVPTX/st-generic.ll | 24 +- llvm/test/CodeGen/NVPTX/st-param-imm.ll | 294 +-- llvm/test/CodeGen/NVPTX/st_bulk.ll | 12 +- llvm/test/CodeGen/NVPTX/stacksaverestore.ll | 6 +- llvm/test/CodeGen/NVPTX/store-retval.ll | 6 +- llvm/test/CodeGen/NVPTX/store-undef.ll | 56 +- llvm/test/CodeGen/NVPTX/surf-read-cuda.ll | 14 +- llvm/test/CodeGen/NVPTX/surf-read.ll | 2 +- llvm/test/CodeGen/NVPTX/surf-write-cuda.ll | 10 +- llvm/test/CodeGen/NVPTX/szext.ll | 20 +- .../test/CodeGen/NVPTX/tag-invariant-loads.ll | 38 +- llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll | 24 +- llvm/test/CodeGen/NVPTX/tcgen05-commit.ll | 24 +- llvm/test/CodeGen/NVPTX/tcgen05-cp.ll | 72 +- llvm/test/CodeGen/NVPTX/tcgen05-ld.ll | 20 +- llvm/test/CodeGen/NVPTX/tcgen05-shift.ll | 2 +- llvm/test/CodeGen/NVPTX/tcgen05-st.ll | 1308 +++++------ llvm/test/CodeGen/NVPTX/tex-read-cuda.ll | 22 +- llvm/test/CodeGen/NVPTX/tex-read.ll | 2 +- llvm/test/CodeGen/NVPTX/texsurf-queries.ll | 8 +- .../NVPTX/unaligned-param-load-store.ll | 86 +- ...unfold-masked-merge-vector-variablemask.ll | 174 +- llvm/test/CodeGen/NVPTX/vaargs.ll | 42 +- llvm/test/CodeGen/NVPTX/variadics-backend.ll | 92 +- llvm/test/CodeGen/NVPTX/vec-param-load.ll | 52 +- llvm/test/CodeGen/NVPTX/vec8.ll | 6 +- llvm/test/CodeGen/NVPTX/vector-args.ll | 8 +- llvm/test/CodeGen/NVPTX/vector-call.ll | 6 +- llvm/test/CodeGen/NVPTX/vector-compare.ll | 8 +- llvm/test/CodeGen/NVPTX/vector-loads.ll | 34 +- llvm/test/CodeGen/NVPTX/vector-select.ll | 8 +- 
llvm/test/CodeGen/NVPTX/vector-stores.ll | 8 +- .../CodeGen/NVPTX/vectorize-misaligned.ll | 8 +- llvm/test/DebugInfo/NVPTX/debug-info.ll | 14 +- .../NaryReassociate/NVPTX/nary-slsr.ll | 2 +- .../NVPTX/split-gep-and-gvn.ll | 36 +- .../NVPTX/reassociate-geps-and-slsr.ll | 12 +- .../NVPTX/speculative-slsr.ll | 4 +- .../Inputs/nvptx-basic.ll.expected | 22 +- 203 files changed, 8668 insertions(+), 8696 deletions(-) diff --git a/clang/test/CodeGenCUDA/bf16.cu b/clang/test/CodeGenCUDA/bf16.cu index f794b83239f1..df56ec60c63a 100644 --- a/clang/test/CodeGenCUDA/bf16.cu +++ b/clang/test/CodeGenCUDA/bf16.cu @@ -11,7 +11,7 @@ // CHECK: .param .align 2 .b8 _Z8test_argPDF16bDF16b_param_1[2] // __device__ void test_arg(__bf16 *out, __bf16 in) { -// CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [_Z8test_argPDF16bDF16b_param_0]; +// CHECK-DAG: ld.param.b64 %[[A:rd[0-9]+]], [_Z8test_argPDF16bDF16b_param_0]; // CHECK-DAG: ld.param.b16 %[[R:rs[0-9]+]], [_Z8test_argPDF16bDF16b_param_1]; __bf16 bf16 = in; *out = bf16; diff --git a/clang/test/CodeGenCUDA/fp-contract.cu b/clang/test/CodeGenCUDA/fp-contract.cu index 60824ba59ddf..d6c796a817cb 100644 --- a/clang/test/CodeGenCUDA/fp-contract.cu +++ b/clang/test/CodeGenCUDA/fp-contract.cu @@ -179,26 +179,26 @@ __host__ __device__ float func(float a, float b, float c) { return a + b * c; } // COMMON-LABEL: _Z4funcfff // NV-ON: fma.rn.f32 -// NV-ON-NEXT: st.param.f32 +// NV-ON-NEXT: st.param.b32 // AMD-ON: v_fmac_f32_e64 // AMD-ON-NEXT: s_setpc_b64 // NV-OFF: mul.rn.f32 // NV-OFF-NEXT: add.rn.f32 -// NV-OFF-NEXT: st.param.f32 +// NV-OFF-NEXT: st.param.b32 // AMD-OFF: v_mul_f32_e64 // AMD-OFF-NEXT: v_add_f32_e64 // AMD-OFF-NEXT: s_setpc_b64 // NV-OPT-FAST: fma.rn.f32 -// NV-OPT-FAST-NEXT: st.param.f32 +// NV-OPT-FAST-NEXT: st.param.b32 // NV-OPT-FASTSTD: fma.rn.f32 -// NV-OPT-FASTSTD-NEXT: st.param.f32 +// NV-OPT-FASTSTD-NEXT: st.param.b32 // NV-OPT-ON: fma.rn.f32 -// NV-OPT-ON-NEXT: st.param.f32 +// NV-OPT-ON-NEXT: st.param.b32 // NV-OPT-OFF: 
mul.rn.f32 // NV-OPT-OFF-NEXT: add.rn.f32 -// NV-OPT-OFF-NEXT: st.param.f32 +// NV-OPT-OFF-NEXT: st.param.b32 // AMD-OPT-FAST-IR: fmul contract float // AMD-OPT-FAST-IR: fadd contract float @@ -224,15 +224,15 @@ __host__ __device__ float func2(float a, float b, float c) { } // COMMON-LABEL: _Z5func2fff // NV-OPT-FAST: fma.rn.f32 -// NV-OPT-FAST-NEXT: st.param.f32 +// NV-OPT-FAST-NEXT: st.param.b32 // NV-OPT-FASTSTD: fma.rn.f32 -// NV-OPT-FASTSTD-NEXT: st.param.f32 +// NV-OPT-FASTSTD-NEXT: st.param.b32 // NV-OPT-ON: mul.rn.f32 // NV-OPT-ON: add.rn.f32 -// NV-OPT-ON-NEXT: st.param.f32 +// NV-OPT-ON-NEXT: st.param.b32 // NV-OPT-OFF: mul.rn.f32 // NV-OPT-OFF: add.rn.f32 -// NV-OPT-OFF-NEXT: st.param.f32 +// NV-OPT-OFF-NEXT: st.param.b32 // AMD-OPT-FAST-IR: fmul contract float // AMD-OPT-FAST-IR: fadd contract float @@ -267,16 +267,16 @@ __host__ __device__ float func2(float a, float b, float c) { } // COMMON-LABEL: _Z5func3fff // NV-OPT-FAST: fma.rn.f32 -// NV-OPT-FAST-NEXT: st.param.f32 +// NV-OPT-FAST-NEXT: st.param.b32 // NV-OPT-FASTSTD: mul.rn.f32 // NV-OPT-FASTSTD: add.rn.f32 -// NV-OPT-FASTSTD-NEXT: st.param.f32 +// NV-OPT-FASTSTD-NEXT: st.param.b32 // NV-OPT-ON: mul.rn.f32 // NV-OPT-ON: add.rn.f32 -// NV-OPT-ON-NEXT: st.param.f32 +// NV-OPT-ON-NEXT: st.param.b32 // NV-OPT-OFF: mul.rn.f32 // NV-OPT-OFF: add.rn.f32 -// NV-OPT-OFF-NEXT: st.param.f32 +// NV-OPT-OFF-NEXT: st.param.b32 // AMD-OPT-FAST-IR: fmul float // AMD-OPT-FAST-IR: fadd float diff --git a/clang/test/CodeGenCUDA/memcpy-libcall.cu b/clang/test/CodeGenCUDA/memcpy-libcall.cu index 1180767545b1..c20fa2faceb0 100644 --- a/clang/test/CodeGenCUDA/memcpy-libcall.cu +++ b/clang/test/CodeGenCUDA/memcpy-libcall.cu @@ -10,15 +10,15 @@ // PTX-LABEL: .func _Z12copy_genericPvPKv( void __device__ copy_generic(void *dest, const void *src) { __builtin_memcpy(dest, src, 32); -// PTX: ld.u8 -// PTX: st.u8 +// PTX: ld.b8 +// PTX: st.b8 } // PTX-LABEL: .entry _Z11copy_globalPvS_( void __global__ copy_global(void *dest, 
void * src) { __builtin_memcpy(dest, src, 32); -// PTX: ld.global.u8 -// PTX: st.global.u8 +// PTX: ld.global.b8 +// PTX: st.global.b8 } struct S { @@ -28,24 +28,24 @@ struct S { // PTX-LABEL: .entry _Z20copy_param_to_globalP1SS_( void __global__ copy_param_to_global(S *global, S param) { __builtin_memcpy(global, ¶m, sizeof(S)); -// PTX: ld.param.u32 -// PTX: st.global.u32 +// PTX: ld.param.b32 +// PTX: st.global.b32 } // PTX-LABEL: .entry _Z19copy_param_to_localPU3AS51SS_( void __global__ copy_param_to_local(__attribute__((address_space(5))) S *local, S param) { __builtin_memcpy(local, ¶m, sizeof(S)); -// PTX: ld.param.u32 -// PTX: st.local.u32 +// PTX: ld.param.b32 +// PTX: st.local.b32 } // PTX-LABEL: .func _Z21copy_local_to_genericP1SPU3AS5S_( void __device__ copy_local_to_generic(S *generic, __attribute__((address_space(5))) S *src) { __builtin_memcpy(generic, src, sizeof(S)); -// PTX: ld.local.u32 -// PTX: st.u32 +// PTX: ld.local.b32 +// PTX: st.b32 } __shared__ S shared; @@ -53,12 +53,12 @@ __shared__ S shared; // PTX-LABEL: .entry _Z20copy_param_to_shared1S( void __global__ copy_param_to_shared( S param) { __builtin_memcpy(&shared, ¶m, sizeof(S)); -// PTX: ld.param.u32 -// PTX: st.shared.u32 +// PTX: ld.param.b32 +// PTX: st.shared.b32 } void __device__ copy_shared_to_generic(S *generic) { __builtin_memcpy(generic, &shared, sizeof(S)); -// PTX: ld.shared.u32 -// PTX: st.u32 +// PTX: ld.shared.b32 +// PTX: st.b32 } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 6f6084b99dda..57971313ba42 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1044,21 +1044,6 @@ pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8, } } -static int getLdStRegType(EVT VT) { - if (VT.isFloatingPoint()) - switch (VT.getSimpleVT().SimpleTy) { - case MVT::f16: - case MVT::bf16: - case MVT::v2f16: - case MVT::v2bf16: - return NVPTX::PTXLdStInstCode::Untyped; - 
default: - return NVPTX::PTXLdStInstCode::Float; - } - else - return NVPTX::PTXLdStInstCode::Unsigned; -} - bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { MemSDNode *LD = cast(N); assert(LD->readMem() && "Expected load"); @@ -1088,24 +1073,14 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { // type is integer // Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float MVT SimpleVT = LoadedVT.getSimpleVT(); - MVT ScalarVT = SimpleVT.getScalarType(); // Read at least 8 bits (predicates are stored as 8-bit values) - unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits()); - unsigned int FromType; + unsigned FromTypeWidth = std::max(8U, (unsigned)SimpleVT.getSizeInBits()); // Vector Setting - unsigned VecType = NVPTX::PTXLdStInstCode::Scalar; - if (SimpleVT.isVector()) { - assert((Isv2x16VT(LoadedVT) || LoadedVT == MVT::v4i8) && - "Unexpected vector type"); - // v2f16/v2bf16/v2i16 is loaded using ld.b32 - FromTypeWidth = 32; - } - - if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD)) - FromType = NVPTX::PTXLdStInstCode::Signed; - else - FromType = getLdStRegType(ScalarVT); + unsigned int FromType = + (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD)) + ? NVPTX::PTXLdStInstCode::Signed + : NVPTX::PTXLdStInstCode::Untyped; assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 && FromTypeWidth <= 128 && "Invalid width for load"); @@ -1116,7 +1091,7 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { SDValue Ops[] = {getI32Imm(Ordering, DL), getI32Imm(Scope, DL), getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), + getI32Imm(NVPTX::PTXLdStInstCode::Scalar, DL), getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base, @@ -1182,7 +1157,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { unsigned ExtensionType = N->getConstantOperandVal(N->getNumOperands() - 1); unsigned FromType = (ExtensionType == ISD::SEXTLOAD) ? 
NVPTX::PTXLdStInstCode::Signed - : getLdStRegType(MemVT.getScalarType()); + : NVPTX::PTXLdStInstCode::Untyped; unsigned VecType; unsigned FromTypeWidth; @@ -1200,8 +1175,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } if (isSubVectorPackedInI32(EltVT)) { + assert(ExtensionType == ISD::NON_EXTLOAD); EltVT = MVT::i32; - FromType = NVPTX::PTXLdStInstCode::Untyped; } assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 && @@ -1405,21 +1380,7 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { auto [Ordering, Scope] = insertMemoryInstructionFence(DL, Chain, ST); // Vector Setting - MVT SimpleVT = StoreVT.getSimpleVT(); - unsigned VecType = NVPTX::PTXLdStInstCode::Scalar; - - // Type Setting: toType + toTypeWidth - // - for integer type, always use 'u' - MVT ScalarVT = SimpleVT.getScalarType(); - unsigned ToTypeWidth = ScalarVT.getSizeInBits(); - if (SimpleVT.isVector()) { - assert((Isv2x16VT(StoreVT) || StoreVT == MVT::v4i8) && - "Unexpected vector type"); - // v2x16 is stored using st.b32 - ToTypeWidth = 32; - } - - unsigned int ToType = getLdStRegType(ScalarVT); + const unsigned ToTypeWidth = StoreVT.getSimpleVT().getSizeInBits(); // Create the machine instruction DAG SDValue Value = PlainStore ? 
PlainStore->getValue() : AtomicStore->getVal(); @@ -1434,8 +1395,8 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { getI32Imm(Ordering, DL), getI32Imm(Scope, DL), getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), - getI32Imm(ToType, DL), + getI32Imm(NVPTX::PTXLdStInstCode::Scalar, DL), + getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL), getI32Imm(ToTypeWidth, DL), Base, Offset, @@ -1481,7 +1442,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { // Type Setting: toType + toTypeWidth // - for integer type, always use 'u' const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits(); - unsigned ToType = getLdStRegType(StoreVT.getSimpleVT().getScalarType()); SmallVector Ops; SDValue N2; @@ -1508,7 +1468,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { if (isSubVectorPackedInI32(EltVT)) { EltVT = MVT::i32; - ToType = NVPTX::PTXLdStInstCode::Untyped; } assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 && @@ -1519,8 +1478,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL), getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), - getI32Imm(ToType, DL), getI32Imm(ToTypeWidth, DL), Base, Offset, - Chain}); + getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL), + getI32Imm(ToTypeWidth, DL), Base, Offset, Chain}); std::optional Opcode; switch (N->getOpcode()) { diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index a384cb79d645..6639554e450f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2249,11 +2249,11 @@ def LoadParamMemV2I8 : LoadParamV2MemInst; def LoadParamMemV4I32 : LoadParamV4MemInst; def LoadParamMemV4I16 : LoadParamV4MemInst; def LoadParamMemV4I8 : LoadParamV4MemInst; -def LoadParamMemF32 : LoadParamMemInst; -def LoadParamMemF64 : LoadParamMemInst; -def LoadParamMemV2F32 : LoadParamV2MemInst; -def LoadParamMemV2F64 : LoadParamV2MemInst; -def LoadParamMemV4F32 : 
LoadParamV4MemInst; +def LoadParamMemF32 : LoadParamMemInst; +def LoadParamMemF64 : LoadParamMemInst; +def LoadParamMemV2F32 : LoadParamV2MemInst; +def LoadParamMemV2F64 : LoadParamV2MemInst; +def LoadParamMemV4F32 : LoadParamV4MemInst; defm StoreParamI64 : StoreParamInst; defm StoreParamI32 : StoreParamInst; @@ -2272,13 +2272,13 @@ defm StoreParamV4I32 : StoreParamV4Inst; defm StoreParamV4I16 : StoreParamV4Inst; defm StoreParamV4I8 : StoreParamV4Inst; -defm StoreParamF32 : StoreParamInst; -defm StoreParamF64 : StoreParamInst; +defm StoreParamF32 : StoreParamInst; +defm StoreParamF64 : StoreParamInst; -defm StoreParamV2F32 : StoreParamV2Inst; -defm StoreParamV2F64 : StoreParamV2Inst; +defm StoreParamV2F32 : StoreParamV2Inst; +defm StoreParamV2F64 : StoreParamV2Inst; -defm StoreParamV4F32 : StoreParamV4Inst; +defm StoreParamV4F32 : StoreParamV4Inst; def StoreRetvalI64 : StoreRetvalInst; def StoreRetvalI32 : StoreRetvalInst; @@ -2294,11 +2294,11 @@ def StoreRetvalV4I32 : StoreRetvalV4Inst; def StoreRetvalV4I16 : StoreRetvalV4Inst; def StoreRetvalV4I8 : StoreRetvalV4Inst; -def StoreRetvalF64 : StoreRetvalInst; -def StoreRetvalF32 : StoreRetvalInst; -def StoreRetvalV2F64 : StoreRetvalV2Inst; -def StoreRetvalV2F32 : StoreRetvalV2Inst; -def StoreRetvalV4F32 : StoreRetvalV4Inst; +def StoreRetvalF64 : StoreRetvalInst; +def StoreRetvalF32 : StoreRetvalInst; +def StoreRetvalV2F64 : StoreRetvalV2Inst; +def StoreRetvalV2F32 : StoreRetvalV2Inst; +def StoreRetvalV4F32 : StoreRetvalV4Inst; def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>; def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index c339817a2d21..81a864b90c04 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2329,12 +2329,12 @@ class LDU_G "ldu.global." 
# TyStr # " \t$result, [$src];", []>, Requires<[hasLDU]>; -def INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8", Int16Regs>; -def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16", Int16Regs>; -def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32", Int32Regs>; -def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64", Int64Regs>; -def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32", Float32Regs>; -def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64", Float64Regs>; +def INT_PTX_LDU_GLOBAL_i8 : LDU_G<"b8", Int16Regs>; +def INT_PTX_LDU_GLOBAL_i16 : LDU_G<"b16", Int16Regs>; +def INT_PTX_LDU_GLOBAL_i32 : LDU_G<"b32", Int32Regs>; +def INT_PTX_LDU_GLOBAL_i64 : LDU_G<"b64", Int64Regs>; +def INT_PTX_LDU_GLOBAL_f32 : LDU_G<"b32", Float32Regs>; +def INT_PTX_LDU_GLOBAL_f64 : LDU_G<"b64", Float64Regs>; // vector @@ -2351,19 +2351,19 @@ class VLDU_G_ELE_V4 "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; -def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"u8", Int16Regs>; -def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"u16", Int16Regs>; -def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"u32", Int32Regs>; -def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"f32", Float32Regs>; -def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"u64", Int64Regs>; -def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"f64", Float64Regs>; +def INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"b8", Int16Regs>; +def INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"b16", Int16Regs>; +def INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"b32", Int32Regs>; +def INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"b32", Float32Regs>; +def INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"b64", Int64Regs>; +def INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"b64", Float64Regs>; -def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"u8", Int16Regs>; -def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"u16", Int16Regs>; -def INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"u32", Int32Regs>; +def INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"b8", Int16Regs>; +def INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>; +def INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"b32", 
Int32Regs>; def INT_PTX_LDU_G_v4f16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>; def INT_PTX_LDU_G_v4f16x2_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>; -def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"f32", Float32Regs>; +def INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"b32", Float32Regs>; //----------------------------------- @@ -2379,12 +2379,12 @@ class LDG_G "ld.global.nc." # TyStr # " \t$result, [$src];", []>, Requires<[hasLDG]>; -def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"u8", Int16Regs>; -def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"u16", Int16Regs>; -def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"u32", Int32Regs>; -def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"u64", Int64Regs>; -def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"f32", Float32Regs>; -def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"f64", Float64Regs>; +def INT_PTX_LDG_GLOBAL_i8 : LDG_G<"b8", Int16Regs>; +def INT_PTX_LDG_GLOBAL_i16 : LDG_G<"b16", Int16Regs>; +def INT_PTX_LDG_GLOBAL_i32 : LDG_G<"b32", Int32Regs>; +def INT_PTX_LDG_GLOBAL_i64 : LDG_G<"b64", Int64Regs>; +def INT_PTX_LDG_GLOBAL_f32 : LDG_G<"b32", Float32Regs>; +def INT_PTX_LDG_GLOBAL_f64 : LDG_G<"b64", Float64Regs>; // vector @@ -2401,17 +2401,17 @@ class VLDG_G_ELE_V4 : "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads. 
-def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"u8", Int16Regs>; -def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"u16", Int16Regs>; -def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"u32", Int32Regs>; -def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"f32", Float32Regs>; -def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"u64", Int64Regs>; -def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"f64", Float64Regs>; +def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"b8", Int16Regs>; +def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"b16", Int16Regs>; +def INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"b32", Int32Regs>; +def INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"b32", Float32Regs>; +def INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"b64", Int64Regs>; +def INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"b64", Float64Regs>; -def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"u8", Int16Regs>; -def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"u16", Int16Regs>; -def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"u32", Int32Regs>; -def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"f32", Float32Regs>; +def INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"b8", Int16Regs>; +def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"b16", Int16Regs>; +def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"b32", Int32Regs>; +def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"b32", Float32Regs>; multiclass NG_TO_G Preds = []> { diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll index 8f0964c2d5eb..78b57badc06e 100644 --- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll +++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll @@ -16,8 +16,8 @@ define i32 @f(ptr %p) { ; ENABLED-NEXT: .reg .b64 %rd<2>; ; ENABLED-EMPTY: ; ENABLED-NEXT: // %bb.0: -; ENABLED-NEXT: ld.param.u64 %rd1, [f_param_0]; -; ENABLED-NEXT: ld.v2.u32 {%r1, %r2}, [%rd1]; +; ENABLED-NEXT: ld.param.b64 %rd1, [f_param_0]; +; ENABLED-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; ; ENABLED-NEXT: add.s32 %r3, %r1, %r2; ; ENABLED-NEXT: st.param.b32 [func_retval0], %r3; ; ENABLED-NEXT: ret; @@ 
-28,9 +28,9 @@ define i32 @f(ptr %p) { ; DISABLED-NEXT: .reg .b64 %rd<2>; ; DISABLED-EMPTY: ; DISABLED-NEXT: // %bb.0: -; DISABLED-NEXT: ld.param.u64 %rd1, [f_param_0]; -; DISABLED-NEXT: ld.u32 %r1, [%rd1]; -; DISABLED-NEXT: ld.u32 %r2, [%rd1+4]; +; DISABLED-NEXT: ld.param.b64 %rd1, [f_param_0]; +; DISABLED-NEXT: ld.b32 %r1, [%rd1]; +; DISABLED-NEXT: ld.b32 %r2, [%rd1+4]; ; DISABLED-NEXT: add.s32 %r3, %r1, %r2; ; DISABLED-NEXT: st.param.b32 [func_retval0], %r3; ; DISABLED-NEXT: ret; @@ -49,7 +49,7 @@ define half @fh(ptr %p) { ; ENABLED-NEXT: .reg .b64 %rd<2>; ; ENABLED-EMPTY: ; ENABLED-NEXT: // %bb.0: -; ENABLED-NEXT: ld.param.u64 %rd1, [fh_param_0]; +; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0]; ; ENABLED-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; ; ENABLED-NEXT: ld.b16 %rs5, [%rd1+8]; ; ENABLED-NEXT: cvt.f32.f16 %f1, %rs2; @@ -78,7 +78,7 @@ define half @fh(ptr %p) { ; DISABLED-NEXT: .reg .b64 %rd<2>; ; DISABLED-EMPTY: ; DISABLED-NEXT: // %bb.0: -; DISABLED-NEXT: ld.param.u64 %rd1, [fh_param_0]; +; DISABLED-NEXT: ld.param.b64 %rd1, [fh_param_0]; ; DISABLED-NEXT: ld.b16 %rs1, [%rd1]; ; DISABLED-NEXT: ld.b16 %rs2, [%rd1+2]; ; DISABLED-NEXT: ld.b16 %rs3, [%rd1+4]; @@ -125,14 +125,14 @@ define float @ff(ptr %p) { ; ENABLED-NEXT: .reg .b64 %rd<2>; ; ENABLED-EMPTY: ; ENABLED-NEXT: // %bb.0: -; ENABLED-NEXT: ld.param.u64 %rd1, [ff_param_0]; -; ENABLED-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; -; ENABLED-NEXT: ld.f32 %f5, [%rd1+16]; +; ENABLED-NEXT: ld.param.b64 %rd1, [ff_param_0]; +; ENABLED-NEXT: ld.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; +; ENABLED-NEXT: ld.b32 %f5, [%rd1+16]; ; ENABLED-NEXT: add.rn.f32 %f6, %f1, %f2; ; ENABLED-NEXT: add.rn.f32 %f7, %f3, %f4; ; ENABLED-NEXT: add.rn.f32 %f8, %f6, %f7; ; ENABLED-NEXT: add.rn.f32 %f9, %f8, %f5; -; ENABLED-NEXT: st.param.f32 [func_retval0], %f9; +; ENABLED-NEXT: st.param.b32 [func_retval0], %f9; ; ENABLED-NEXT: ret; ; ; DISABLED-LABEL: ff( @@ -141,17 +141,17 @@ define float @ff(ptr %p) { ; DISABLED-NEXT: .reg 
.b64 %rd<2>; ; DISABLED-EMPTY: ; DISABLED-NEXT: // %bb.0: -; DISABLED-NEXT: ld.param.u64 %rd1, [ff_param_0]; -; DISABLED-NEXT: ld.f32 %f1, [%rd1]; -; DISABLED-NEXT: ld.f32 %f2, [%rd1+4]; -; DISABLED-NEXT: ld.f32 %f3, [%rd1+8]; -; DISABLED-NEXT: ld.f32 %f4, [%rd1+12]; -; DISABLED-NEXT: ld.f32 %f5, [%rd1+16]; +; DISABLED-NEXT: ld.param.b64 %rd1, [ff_param_0]; +; DISABLED-NEXT: ld.b32 %f1, [%rd1]; +; DISABLED-NEXT: ld.b32 %f2, [%rd1+4]; +; DISABLED-NEXT: ld.b32 %f3, [%rd1+8]; +; DISABLED-NEXT: ld.b32 %f4, [%rd1+12]; +; DISABLED-NEXT: ld.b32 %f5, [%rd1+16]; ; DISABLED-NEXT: add.rn.f32 %f6, %f1, %f2; ; DISABLED-NEXT: add.rn.f32 %f7, %f3, %f4; ; DISABLED-NEXT: add.rn.f32 %f8, %f6, %f7; ; DISABLED-NEXT: add.rn.f32 %f9, %f8, %f5; -; DISABLED-NEXT: st.param.f32 [func_retval0], %f9; +; DISABLED-NEXT: st.param.b32 [func_retval0], %f9; ; DISABLED-NEXT: ret; %p.1 = getelementptr float, ptr %p, i32 1 %p.2 = getelementptr float, ptr %p, i32 2 @@ -176,9 +176,9 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr ; ENABLED-NEXT: .reg .b64 %rd<3>; ; ENABLED-EMPTY: ; ENABLED-NEXT: // %bb.0: -; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_param_0]; +; ENABLED-NEXT: ld.param.b64 %rd1, [combine_v16i8_param_0]; ; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; -; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_param_1]; +; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_param_1]; ; ENABLED-NEXT: bfe.u32 %r5, %r1, 0, 8; ; ENABLED-NEXT: bfe.u32 %r6, %r1, 8, 8; ; ENABLED-NEXT: bfe.u32 %r7, %r1, 16, 8; @@ -210,7 +210,7 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr ; ENABLED-NEXT: add.s32 %r33, %r32, %r18; ; ENABLED-NEXT: add.s32 %r34, %r33, %r19; ; ENABLED-NEXT: add.s32 %r35, %r34, %r20; -; ENABLED-NEXT: st.u32 [%rd2], %r35; +; ENABLED-NEXT: st.b32 [%rd2], %r35; ; ENABLED-NEXT: ret; ; ; DISABLED-LABEL: combine_v16i8( @@ -219,24 +219,24 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 
%ptr ; DISABLED-NEXT: .reg .b64 %rd<3>; ; DISABLED-EMPTY: ; DISABLED-NEXT: // %bb.0: -; DISABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_param_0]; -; DISABLED-NEXT: ld.u8 %r1, [%rd1]; -; DISABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_param_1]; -; DISABLED-NEXT: ld.u8 %r2, [%rd1+1]; -; DISABLED-NEXT: ld.u8 %r3, [%rd1+2]; -; DISABLED-NEXT: ld.u8 %r4, [%rd1+3]; -; DISABLED-NEXT: ld.u8 %r5, [%rd1+4]; -; DISABLED-NEXT: ld.u8 %r6, [%rd1+5]; -; DISABLED-NEXT: ld.u8 %r7, [%rd1+6]; -; DISABLED-NEXT: ld.u8 %r8, [%rd1+7]; -; DISABLED-NEXT: ld.u8 %r9, [%rd1+8]; -; DISABLED-NEXT: ld.u8 %r10, [%rd1+9]; -; DISABLED-NEXT: ld.u8 %r11, [%rd1+10]; -; DISABLED-NEXT: ld.u8 %r12, [%rd1+11]; -; DISABLED-NEXT: ld.u8 %r13, [%rd1+12]; -; DISABLED-NEXT: ld.u8 %r14, [%rd1+13]; -; DISABLED-NEXT: ld.u8 %r15, [%rd1+14]; -; DISABLED-NEXT: ld.u8 %r16, [%rd1+15]; +; DISABLED-NEXT: ld.param.b64 %rd1, [combine_v16i8_param_0]; +; DISABLED-NEXT: ld.b8 %r1, [%rd1]; +; DISABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_param_1]; +; DISABLED-NEXT: ld.b8 %r2, [%rd1+1]; +; DISABLED-NEXT: ld.b8 %r3, [%rd1+2]; +; DISABLED-NEXT: ld.b8 %r4, [%rd1+3]; +; DISABLED-NEXT: ld.b8 %r5, [%rd1+4]; +; DISABLED-NEXT: ld.b8 %r6, [%rd1+5]; +; DISABLED-NEXT: ld.b8 %r7, [%rd1+6]; +; DISABLED-NEXT: ld.b8 %r8, [%rd1+7]; +; DISABLED-NEXT: ld.b8 %r9, [%rd1+8]; +; DISABLED-NEXT: ld.b8 %r10, [%rd1+9]; +; DISABLED-NEXT: ld.b8 %r11, [%rd1+10]; +; DISABLED-NEXT: ld.b8 %r12, [%rd1+11]; +; DISABLED-NEXT: ld.b8 %r13, [%rd1+12]; +; DISABLED-NEXT: ld.b8 %r14, [%rd1+13]; +; DISABLED-NEXT: ld.b8 %r15, [%rd1+14]; +; DISABLED-NEXT: ld.b8 %r16, [%rd1+15]; ; DISABLED-NEXT: add.s32 %r17, %r1, %r2; ; DISABLED-NEXT: add.s32 %r18, %r17, %r3; ; DISABLED-NEXT: add.s32 %r19, %r18, %r4; @@ -252,7 +252,7 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr ; DISABLED-NEXT: add.s32 %r29, %r28, %r14; ; DISABLED-NEXT: add.s32 %r30, %r29, %r15; ; DISABLED-NEXT: add.s32 %r31, %r30, %r16; -; DISABLED-NEXT: st.u32 [%rd2], %r31; +; 
DISABLED-NEXT: st.b32 [%rd2], %r31; ; DISABLED-NEXT: ret; %val0 = load i8, ptr %ptr1, align 16 %ptr1.1 = getelementptr inbounds i8, ptr %ptr1, i64 1 @@ -327,9 +327,9 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig ; ENABLED-NEXT: .reg .b64 %rd<3>; ; ENABLED-EMPTY: ; ENABLED-NEXT: // %bb.0: -; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0]; +; ENABLED-NEXT: ld.param.b64 %rd1, [combine_v16i8_unaligned_param_0]; ; ENABLED-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; -; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1]; +; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_unaligned_param_1]; ; ENABLED-NEXT: ld.v2.b32 {%r3, %r4}, [%rd1+8]; ; ENABLED-NEXT: bfe.u32 %r5, %r1, 0, 8; ; ENABLED-NEXT: bfe.u32 %r6, %r1, 8, 8; @@ -362,7 +362,7 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig ; ENABLED-NEXT: add.s32 %r33, %r32, %r18; ; ENABLED-NEXT: add.s32 %r34, %r33, %r19; ; ENABLED-NEXT: add.s32 %r35, %r34, %r20; -; ENABLED-NEXT: st.u32 [%rd2], %r35; +; ENABLED-NEXT: st.b32 [%rd2], %r35; ; ENABLED-NEXT: ret; ; ; DISABLED-LABEL: combine_v16i8_unaligned( @@ -371,24 +371,24 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig ; DISABLED-NEXT: .reg .b64 %rd<3>; ; DISABLED-EMPTY: ; DISABLED-NEXT: // %bb.0: -; DISABLED-NEXT: ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0]; -; DISABLED-NEXT: ld.u8 %r1, [%rd1]; -; DISABLED-NEXT: ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1]; -; DISABLED-NEXT: ld.u8 %r2, [%rd1+1]; -; DISABLED-NEXT: ld.u8 %r3, [%rd1+2]; -; DISABLED-NEXT: ld.u8 %r4, [%rd1+3]; -; DISABLED-NEXT: ld.u8 %r5, [%rd1+4]; -; DISABLED-NEXT: ld.u8 %r6, [%rd1+5]; -; DISABLED-NEXT: ld.u8 %r7, [%rd1+6]; -; DISABLED-NEXT: ld.u8 %r8, [%rd1+7]; -; DISABLED-NEXT: ld.u8 %r9, [%rd1+8]; -; DISABLED-NEXT: ld.u8 %r10, [%rd1+9]; -; DISABLED-NEXT: ld.u8 %r11, [%rd1+10]; -; DISABLED-NEXT: ld.u8 %r12, [%rd1+11]; -; DISABLED-NEXT: ld.u8 %r13, [%rd1+12]; -; 
DISABLED-NEXT: ld.u8 %r14, [%rd1+13]; -; DISABLED-NEXT: ld.u8 %r15, [%rd1+14]; -; DISABLED-NEXT: ld.u8 %r16, [%rd1+15]; +; DISABLED-NEXT: ld.param.b64 %rd1, [combine_v16i8_unaligned_param_0]; +; DISABLED-NEXT: ld.b8 %r1, [%rd1]; +; DISABLED-NEXT: ld.param.b64 %rd2, [combine_v16i8_unaligned_param_1]; +; DISABLED-NEXT: ld.b8 %r2, [%rd1+1]; +; DISABLED-NEXT: ld.b8 %r3, [%rd1+2]; +; DISABLED-NEXT: ld.b8 %r4, [%rd1+3]; +; DISABLED-NEXT: ld.b8 %r5, [%rd1+4]; +; DISABLED-NEXT: ld.b8 %r6, [%rd1+5]; +; DISABLED-NEXT: ld.b8 %r7, [%rd1+6]; +; DISABLED-NEXT: ld.b8 %r8, [%rd1+7]; +; DISABLED-NEXT: ld.b8 %r9, [%rd1+8]; +; DISABLED-NEXT: ld.b8 %r10, [%rd1+9]; +; DISABLED-NEXT: ld.b8 %r11, [%rd1+10]; +; DISABLED-NEXT: ld.b8 %r12, [%rd1+11]; +; DISABLED-NEXT: ld.b8 %r13, [%rd1+12]; +; DISABLED-NEXT: ld.b8 %r14, [%rd1+13]; +; DISABLED-NEXT: ld.b8 %r15, [%rd1+14]; +; DISABLED-NEXT: ld.b8 %r16, [%rd1+15]; ; DISABLED-NEXT: add.s32 %r17, %r1, %r2; ; DISABLED-NEXT: add.s32 %r18, %r17, %r3; ; DISABLED-NEXT: add.s32 %r19, %r18, %r4; @@ -404,7 +404,7 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig ; DISABLED-NEXT: add.s32 %r29, %r28, %r14; ; DISABLED-NEXT: add.s32 %r30, %r29, %r15; ; DISABLED-NEXT: add.s32 %r31, %r30, %r16; -; DISABLED-NEXT: st.u32 [%rd2], %r31; +; DISABLED-NEXT: st.b32 [%rd2], %r31; ; DISABLED-NEXT: ret; %val0 = load i8, ptr %ptr1, align 8 %ptr1.1 = getelementptr inbounds i8, ptr %ptr1, i64 1 @@ -481,13 +481,13 @@ define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr ; ENABLED-NEXT: .reg .b64 %rd<3>; ; ENABLED-EMPTY: ; ENABLED-NEXT: // %bb.0: -; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v8i16_param_0]; +; ENABLED-NEXT: ld.param.b64 %rd1, [combine_v8i16_param_0]; ; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; ENABLED-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; ENABLED-NEXT: mov.b32 {%rs3, %rs4}, %r3; ; ENABLED-NEXT: mov.b32 {%rs5, %rs6}, %r2; ; ENABLED-NEXT: mov.b32 {%rs7, %rs8}, %r1; -; ENABLED-NEXT: 
ld.param.u64 %rd2, [combine_v8i16_param_1]; +; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v8i16_param_1]; ; ENABLED-NEXT: cvt.u32.u16 %r5, %rs7; ; ENABLED-NEXT: cvt.u32.u16 %r6, %rs8; ; ENABLED-NEXT: cvt.u32.u16 %r7, %rs5; @@ -503,7 +503,7 @@ define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr ; ENABLED-NEXT: add.s32 %r17, %r16, %r10; ; ENABLED-NEXT: add.s32 %r18, %r17, %r11; ; ENABLED-NEXT: add.s32 %r19, %r18, %r12; -; ENABLED-NEXT: st.u32 [%rd2], %r19; +; ENABLED-NEXT: st.b32 [%rd2], %r19; ; ENABLED-NEXT: ret; ; ; DISABLED-LABEL: combine_v8i16( @@ -512,16 +512,16 @@ define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr ; DISABLED-NEXT: .reg .b64 %rd<3>; ; DISABLED-EMPTY: ; DISABLED-NEXT: // %bb.0: -; DISABLED-NEXT: ld.param.u64 %rd1, [combine_v8i16_param_0]; -; DISABLED-NEXT: ld.u16 %r1, [%rd1]; -; DISABLED-NEXT: ld.param.u64 %rd2, [combine_v8i16_param_1]; -; DISABLED-NEXT: ld.u16 %r2, [%rd1+2]; -; DISABLED-NEXT: ld.u16 %r3, [%rd1+4]; -; DISABLED-NEXT: ld.u16 %r4, [%rd1+6]; -; DISABLED-NEXT: ld.u16 %r5, [%rd1+8]; -; DISABLED-NEXT: ld.u16 %r6, [%rd1+10]; -; DISABLED-NEXT: ld.u16 %r7, [%rd1+12]; -; DISABLED-NEXT: ld.u16 %r8, [%rd1+14]; +; DISABLED-NEXT: ld.param.b64 %rd1, [combine_v8i16_param_0]; +; DISABLED-NEXT: ld.b16 %r1, [%rd1]; +; DISABLED-NEXT: ld.param.b64 %rd2, [combine_v8i16_param_1]; +; DISABLED-NEXT: ld.b16 %r2, [%rd1+2]; +; DISABLED-NEXT: ld.b16 %r3, [%rd1+4]; +; DISABLED-NEXT: ld.b16 %r4, [%rd1+6]; +; DISABLED-NEXT: ld.b16 %r5, [%rd1+8]; +; DISABLED-NEXT: ld.b16 %r6, [%rd1+10]; +; DISABLED-NEXT: ld.b16 %r7, [%rd1+12]; +; DISABLED-NEXT: ld.b16 %r8, [%rd1+14]; ; DISABLED-NEXT: add.s32 %r9, %r1, %r2; ; DISABLED-NEXT: add.s32 %r10, %r9, %r3; ; DISABLED-NEXT: add.s32 %r11, %r10, %r4; @@ -529,7 +529,7 @@ define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr ; DISABLED-NEXT: add.s32 %r13, %r12, %r6; ; DISABLED-NEXT: add.s32 %r14, %r13, %r7; ; DISABLED-NEXT: add.s32 %r15, %r14, 
%r8; -; DISABLED-NEXT: st.u32 [%rd2], %r15; +; DISABLED-NEXT: st.b32 [%rd2], %r15; ; DISABLED-NEXT: ret; %val0 = load i16, ptr %ptr1, align 16 %ptr1.1 = getelementptr inbounds i16, ptr %ptr1, i64 1 @@ -572,13 +572,13 @@ define void @combine_v4i32(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr ; ENABLED-NEXT: .reg .b64 %rd<3>; ; ENABLED-EMPTY: ; ENABLED-NEXT: // %bb.0: -; ENABLED-NEXT: ld.param.u64 %rd1, [combine_v4i32_param_0]; -; ENABLED-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; -; ENABLED-NEXT: ld.param.u64 %rd2, [combine_v4i32_param_1]; +; ENABLED-NEXT: ld.param.b64 %rd1, [combine_v4i32_param_0]; +; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; ENABLED-NEXT: ld.param.b64 %rd2, [combine_v4i32_param_1]; ; ENABLED-NEXT: add.s32 %r5, %r1, %r2; ; ENABLED-NEXT: add.s32 %r6, %r5, %r3; ; ENABLED-NEXT: add.s32 %r7, %r6, %r4; -; ENABLED-NEXT: st.u32 [%rd2], %r7; +; ENABLED-NEXT: st.b32 [%rd2], %r7; ; ENABLED-NEXT: ret; ; ; DISABLED-LABEL: combine_v4i32( @@ -587,16 +587,16 @@ define void @combine_v4i32(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr ; DISABLED-NEXT: .reg .b64 %rd<3>; ; DISABLED-EMPTY: ; DISABLED-NEXT: // %bb.0: -; DISABLED-NEXT: ld.param.u64 %rd1, [combine_v4i32_param_0]; -; DISABLED-NEXT: ld.u32 %r1, [%rd1]; -; DISABLED-NEXT: ld.param.u64 %rd2, [combine_v4i32_param_1]; -; DISABLED-NEXT: ld.u32 %r2, [%rd1+4]; -; DISABLED-NEXT: ld.u32 %r3, [%rd1+8]; -; DISABLED-NEXT: ld.u32 %r4, [%rd1+12]; +; DISABLED-NEXT: ld.param.b64 %rd1, [combine_v4i32_param_0]; +; DISABLED-NEXT: ld.b32 %r1, [%rd1]; +; DISABLED-NEXT: ld.param.b64 %rd2, [combine_v4i32_param_1]; +; DISABLED-NEXT: ld.b32 %r2, [%rd1+4]; +; DISABLED-NEXT: ld.b32 %r3, [%rd1+8]; +; DISABLED-NEXT: ld.b32 %r4, [%rd1+12]; ; DISABLED-NEXT: add.s32 %r5, %r1, %r2; ; DISABLED-NEXT: add.s32 %r6, %r5, %r3; ; DISABLED-NEXT: add.s32 %r7, %r6, %r4; -; DISABLED-NEXT: st.u32 [%rd2], %r7; +; DISABLED-NEXT: st.b32 [%rd2], %r7; ; DISABLED-NEXT: ret; %val0 = load i32, ptr %ptr1, align 16 %ptr1.1 = 
getelementptr inbounds i32, ptr %ptr1, i64 1 diff --git a/llvm/test/CodeGen/NVPTX/MachineSink-call.ll b/llvm/test/CodeGen/NVPTX/MachineSink-call.ll index ee2535f16fc8..aeb4a50e96f8 100644 --- a/llvm/test/CodeGen/NVPTX/MachineSink-call.ll +++ b/llvm/test/CodeGen/NVPTX/MachineSink-call.ll @@ -10,7 +10,7 @@ declare void @foo() ; the call may modify memory. define i32 @f(i32 %x, ptr %ptr, i1 %cond) { Start: - ; CHECK: ld.u32 + ; CHECK: ld.b32 %ptr_val = load i32, ptr %ptr ; CHECK: call.uni call void @foo() diff --git a/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll b/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll index 222f147a7d46..43085cf718bf 100644 --- a/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll +++ b/llvm/test/CodeGen/NVPTX/MachineSink-convergent.ll @@ -10,7 +10,7 @@ declare void @llvm.nvvm.barrier0() ; syncthreads is modeled as maystore. define i32 @f(i32 %x, ptr %ptr, i1 %cond) { Start: - ; CHECK: ld.u32 + ; CHECK: ld.b32 %ptr_val = load i32, ptr %ptr ; CHECK: bar.sync call void @llvm.nvvm.barrier0() diff --git a/llvm/test/CodeGen/NVPTX/access-non-generic.ll b/llvm/test/CodeGen/NVPTX/access-non-generic.ll index 86d3f33a0421..a816f2e84b06 100644 --- a/llvm/test/CodeGen/NVPTX/access-non-generic.ll +++ b/llvm/test/CodeGen/NVPTX/access-non-generic.ll @@ -23,10 +23,10 @@ define void @ld_st_shared_f32(i32 %i, float %v) { ; load cast %1 = load float, ptr addrspacecast (ptr addrspace(3) @scalar to ptr), align 4 call void @use(float %1) -; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar]; +; PTX: ld.shared.b32 %f{{[0-9]+}}, [scalar]; ; store cast store float %v, ptr addrspacecast (ptr addrspace(3) @scalar to ptr), align 4 -; PTX: st.shared.f32 [scalar], %f{{[0-9]+}}; +; PTX: st.shared.b32 [scalar], %f{{[0-9]+}}; ; use syncthreads to disable optimizations across components call void @llvm.nvvm.barrier0() ; PTX: bar.sync 0; @@ -35,20 +35,20 @@ define void @ld_st_shared_f32(i32 %i, float %v) { %2 = addrspacecast ptr addrspace(3) @scalar to ptr %3 = load float, ptr 
%2, align 4 call void @use(float %3) -; PTX: ld.shared.f32 %f{{[0-9]+}}, [scalar]; +; PTX: ld.shared.b32 %f{{[0-9]+}}, [scalar]; ; cast; store store float %v, ptr %2, align 4 -; PTX: st.shared.f32 [scalar], %f{{[0-9]+}}; +; PTX: st.shared.b32 [scalar], %f{{[0-9]+}}; call void @llvm.nvvm.barrier0() ; PTX: bar.sync 0; ; load gep cast %4 = load float, ptr getelementptr inbounds ([10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i32 0, i32 5), align 4 call void @use(float %4) -; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20]; +; PTX: ld.shared.b32 %f{{[0-9]+}}, [array+20]; ; store gep cast store float %v, ptr getelementptr inbounds ([10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i32 0, i32 5), align 4 -; PTX: st.shared.f32 [array+20], %f{{[0-9]+}}; +; PTX: st.shared.b32 [array+20], %f{{[0-9]+}}; call void @llvm.nvvm.barrier0() ; PTX: bar.sync 0; @@ -56,10 +56,10 @@ define void @ld_st_shared_f32(i32 %i, float %v) { %5 = getelementptr inbounds [10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i32 0, i32 5 %6 = load float, ptr %5, align 4 call void @use(float %6) -; PTX: ld.shared.f32 %f{{[0-9]+}}, [array+20]; +; PTX: ld.shared.b32 %f{{[0-9]+}}, [array+20]; ; gep cast; store store float %v, ptr %5, align 4 -; PTX: st.shared.f32 [array+20], %f{{[0-9]+}}; +; PTX: st.shared.b32 [array+20], %f{{[0-9]+}}; call void @llvm.nvvm.barrier0() ; PTX: bar.sync 0; @@ -68,10 +68,10 @@ define void @ld_st_shared_f32(i32 %i, float %v) { %8 = getelementptr inbounds [10 x float], ptr %7, i32 0, i32 %i %9 = load float, ptr %8, align 4 call void @use(float %9) -; PTX: ld.shared.f32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}]; +; PTX: ld.shared.b32 %f{{[0-9]+}}, [%{{(r|rl|rd)[0-9]+}}]; ; cast; gep; store store float %v, ptr %8, align 4 -; PTX: st.shared.f32 [%{{(r|rl|rd)[0-9]+}}], %f{{[0-9]+}}; +; PTX: st.shared.b32 [%{{(r|rl|rd)[0-9]+}}], %f{{[0-9]+}}; call void @llvm.nvvm.barrier0() ; PTX: bar.sync 0; @@ -84,7 +84,7 @@ define i32 @ld_int_from_float() { 
; IR-LABEL: @ld_int_from_float ; IR: load i32, ptr addrspace(3) @scalar ; PTX-LABEL: ld_int_from_float( -; PTX: ld.shared.u{{(32|64)}} +; PTX: ld.shared.b{{(32|64)}} %1 = load i32, ptr addrspacecast(ptr addrspace(3) @scalar to ptr), align 4 ret i32 %1 } @@ -108,7 +108,7 @@ define void @nested_const_expr() { ; store 1 to bitcast(gep(addrspacecast(array), 0, 1)) store i32 1, ptr getelementptr ([10 x float], ptr addrspacecast (ptr addrspace(3) @array to ptr), i64 0, i64 1), align 4 ; PTX: mov.b32 %r1, 1; -; PTX-NEXT: st.shared.u32 [array+4], %r1; +; PTX-NEXT: st.shared.b32 [array+4], %r1; ret void } diff --git a/llvm/test/CodeGen/NVPTX/addr-mode.ll b/llvm/test/CodeGen/NVPTX/addr-mode.ll index ab8fab6c8a3f..7b02872bfb61 100644 --- a/llvm/test/CodeGen/NVPTX/addr-mode.ll +++ b/llvm/test/CodeGen/NVPTX/addr-mode.ll @@ -10,8 +10,8 @@ define i32 @test_addr_mode_i64(ptr %x) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i64_param_0]; -; CHECK-NEXT: ld.u32 %r1, [%rd1+-4]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_addr_mode_i64_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1+-4]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %addr = getelementptr i32, ptr %x, i64 -1 @@ -26,8 +26,8 @@ define i32 @test_addr_mode_i32(ptr %x) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i32_param_0]; -; CHECK-NEXT: ld.u32 %r1, [%rd1+-4]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_addr_mode_i32_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1+-4]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %addr = getelementptr i32, ptr %x, i32 -1 @@ -42,8 +42,8 @@ define i32 @test_addr_mode_i16(ptr %x) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i16_param_0]; -; CHECK-NEXT: ld.u32 %r1, [%rd1+-4]; +; CHECK-NEXT: ld.param.b64 %rd1, 
[test_addr_mode_i16_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1+-4]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %addr = getelementptr i32, ptr %x, i16 -1 @@ -58,8 +58,8 @@ define i32 @test_addr_mode_i8(ptr %x) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i8_param_0]; -; CHECK-NEXT: ld.u32 %r1, [%rd1+-4]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_addr_mode_i8_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1+-4]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %addr = getelementptr i32, ptr %x, i8 -1 @@ -74,9 +74,9 @@ define i32 @test_addr_mode_i64_large(ptr %x) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_addr_mode_i64_large_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_addr_mode_i64_large_param_0]; ; CHECK-NEXT: add.s64 %rd2, %rd1, 17179869172; -; CHECK-NEXT: ld.u32 %r1, [%rd2]; +; CHECK-NEXT: ld.b32 %r1, [%rd2]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %addr = getelementptr i32, ptr %x, i64 4294967293 diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast-folding.ll b/llvm/test/CodeGen/NVPTX/addrspacecast-folding.ll index 87698c1c9644..b3e5cbe09a09 100644 --- a/llvm/test/CodeGen/NVPTX/addrspacecast-folding.ll +++ b/llvm/test/CodeGen/NVPTX/addrspacecast-folding.ll @@ -10,7 +10,7 @@ define ptr @test1(ptr %p) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test1_param_0]; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd1; ; CHECK-NEXT: ret; %a = addrspacecast ptr %p to ptr addrspace(5) @@ -24,7 +24,7 @@ define ptr addrspace(1) @test2(ptr addrspace(5) %p) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test2_param_0]; ; CHECK-NEXT: cvta.local.u64 
%rd2, %rd1; ; CHECK-NEXT: cvta.to.global.u64 %rd3, %rd2; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll b/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll index 01326db9a8b1..00b17896d2c9 100644 --- a/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll +++ b/llvm/test/CodeGen/NVPTX/addrspacecast-ptx64.ll @@ -13,9 +13,9 @@ define i32 @conv_shared_cluster_to_generic(ptr addrspace(7) %ptr) { ; NOPTRCONV-NEXT: .reg .b64 %rd<3>; ; NOPTRCONV-EMPTY: ; NOPTRCONV-NEXT: // %bb.0: -; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_shared_cluster_to_generic_param_0]; +; NOPTRCONV-NEXT: ld.param.b64 %rd1, [conv_shared_cluster_to_generic_param_0]; ; NOPTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1; -; NOPTRCONV-NEXT: ld.u32 %r1, [%rd2]; +; NOPTRCONV-NEXT: ld.b32 %r1, [%rd2]; ; NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1; ; NOPTRCONV-NEXT: ret; ; @@ -25,10 +25,10 @@ define i32 @conv_shared_cluster_to_generic(ptr addrspace(7) %ptr) { ; PTRCONV-NEXT: .reg .b64 %rd<3>; ; PTRCONV-EMPTY: ; PTRCONV-NEXT: // %bb.0: -; PTRCONV-NEXT: ld.param.u32 %r1, [conv_shared_cluster_to_generic_param_0]; +; PTRCONV-NEXT: ld.param.b32 %r1, [conv_shared_cluster_to_generic_param_0]; ; PTRCONV-NEXT: cvt.u64.u32 %rd1, %r1; ; PTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1; -; PTRCONV-NEXT: ld.u32 %r2, [%rd2]; +; PTRCONV-NEXT: ld.b32 %r2, [%rd2]; ; PTRCONV-NEXT: st.param.b32 [func_retval0], %r2; ; PTRCONV-NEXT: ret; %genptr = addrspacecast ptr addrspace(7) %ptr to ptr @@ -45,9 +45,9 @@ define i32 @conv_generic_to_shared_cluster(ptr %ptr) { ; NOPTRCONV-NEXT: .reg .b64 %rd<3>; ; NOPTRCONV-EMPTY: ; NOPTRCONV-NEXT: // %bb.0: -; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_generic_to_shared_cluster_param_0]; +; NOPTRCONV-NEXT: ld.param.b64 %rd1, [conv_generic_to_shared_cluster_param_0]; ; NOPTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd2, %rd1; -; NOPTRCONV-NEXT: ld.shared::cluster.u32 %r1, [%rd2]; +; NOPTRCONV-NEXT: ld.shared::cluster.b32 %r1, [%rd2]; ; 
NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1; ; NOPTRCONV-NEXT: ret; ; @@ -57,10 +57,10 @@ define i32 @conv_generic_to_shared_cluster(ptr %ptr) { ; PTRCONV-NEXT: .reg .b64 %rd<3>; ; PTRCONV-EMPTY: ; PTRCONV-NEXT: // %bb.0: -; PTRCONV-NEXT: ld.param.u64 %rd1, [conv_generic_to_shared_cluster_param_0]; +; PTRCONV-NEXT: ld.param.b64 %rd1, [conv_generic_to_shared_cluster_param_0]; ; PTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd2, %rd1; ; PTRCONV-NEXT: cvt.u32.u64 %r1, %rd2; -; PTRCONV-NEXT: ld.shared::cluster.u32 %r2, [%r1]; +; PTRCONV-NEXT: ld.shared::cluster.b32 %r2, [%r1]; ; PTRCONV-NEXT: st.param.b32 [func_retval0], %r2; ; PTRCONV-NEXT: ret; %specptr = addrspacecast ptr %ptr to ptr addrspace(7) @@ -76,10 +76,10 @@ define i32 @conv_shared_to_shared_cluster(ptr addrspace(3) %ptr) { ; NOPTRCONV-NEXT: .reg .b64 %rd<4>; ; NOPTRCONV-EMPTY: ; NOPTRCONV-NEXT: // %bb.0: -; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_shared_to_shared_cluster_param_0]; +; NOPTRCONV-NEXT: ld.param.b64 %rd1, [conv_shared_to_shared_cluster_param_0]; ; NOPTRCONV-NEXT: cvta.shared.u64 %rd2, %rd1; ; NOPTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd3, %rd2; -; NOPTRCONV-NEXT: ld.shared::cluster.u32 %r1, [%rd3]; +; NOPTRCONV-NEXT: ld.shared::cluster.b32 %r1, [%rd3]; ; NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1; ; NOPTRCONV-NEXT: ret; ; @@ -89,12 +89,12 @@ define i32 @conv_shared_to_shared_cluster(ptr addrspace(3) %ptr) { ; PTRCONV-NEXT: .reg .b64 %rd<4>; ; PTRCONV-EMPTY: ; PTRCONV-NEXT: // %bb.0: -; PTRCONV-NEXT: ld.param.u32 %r1, [conv_shared_to_shared_cluster_param_0]; +; PTRCONV-NEXT: ld.param.b32 %r1, [conv_shared_to_shared_cluster_param_0]; ; PTRCONV-NEXT: cvt.u64.u32 %rd1, %r1; ; PTRCONV-NEXT: cvta.shared.u64 %rd2, %rd1; ; PTRCONV-NEXT: cvta.to.shared::cluster.u64 %rd3, %rd2; ; PTRCONV-NEXT: cvt.u32.u64 %r2, %rd3; -; PTRCONV-NEXT: ld.shared::cluster.u32 %r3, [%r2]; +; PTRCONV-NEXT: ld.shared::cluster.b32 %r3, [%r2]; ; PTRCONV-NEXT: st.param.b32 [func_retval0], %r3; ; PTRCONV-NEXT: ret; 
%specptr = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(7) @@ -110,10 +110,10 @@ define i32 @conv_shared_cluster_to_shared(ptr addrspace(7) %ptr) { ; NOPTRCONV-NEXT: .reg .b64 %rd<4>; ; NOPTRCONV-EMPTY: ; NOPTRCONV-NEXT: // %bb.0: -; NOPTRCONV-NEXT: ld.param.u64 %rd1, [conv_shared_cluster_to_shared_param_0]; +; NOPTRCONV-NEXT: ld.param.b64 %rd1, [conv_shared_cluster_to_shared_param_0]; ; NOPTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1; ; NOPTRCONV-NEXT: cvta.to.shared.u64 %rd3, %rd2; -; NOPTRCONV-NEXT: ld.shared.u32 %r1, [%rd3]; +; NOPTRCONV-NEXT: ld.shared.b32 %r1, [%rd3]; ; NOPTRCONV-NEXT: st.param.b32 [func_retval0], %r1; ; NOPTRCONV-NEXT: ret; ; @@ -123,12 +123,12 @@ define i32 @conv_shared_cluster_to_shared(ptr addrspace(7) %ptr) { ; PTRCONV-NEXT: .reg .b64 %rd<4>; ; PTRCONV-EMPTY: ; PTRCONV-NEXT: // %bb.0: -; PTRCONV-NEXT: ld.param.u32 %r1, [conv_shared_cluster_to_shared_param_0]; +; PTRCONV-NEXT: ld.param.b32 %r1, [conv_shared_cluster_to_shared_param_0]; ; PTRCONV-NEXT: cvt.u64.u32 %rd1, %r1; ; PTRCONV-NEXT: cvta.shared::cluster.u64 %rd2, %rd1; ; PTRCONV-NEXT: cvta.to.shared.u64 %rd3, %rd2; ; PTRCONV-NEXT: cvt.u32.u64 %r2, %rd3; -; PTRCONV-NEXT: ld.shared.u32 %r3, [%r2]; +; PTRCONV-NEXT: ld.shared.b32 %r3, [%r2]; ; PTRCONV-NEXT: st.param.b32 [func_retval0], %r3; ; PTRCONV-NEXT: ret; %specptr = addrspacecast ptr addrspace(7) %ptr to ptr addrspace(3) diff --git a/llvm/test/CodeGen/NVPTX/addrspacecast.ll b/llvm/test/CodeGen/NVPTX/addrspacecast.ll index 0aa66d1fc45f..86008a1b7005 100644 --- a/llvm/test/CodeGen/NVPTX/addrspacecast.ll +++ b/llvm/test/CodeGen/NVPTX/addrspacecast.ll @@ -10,7 +10,7 @@ define i32 @conv1(ptr addrspace(1) %ptr) { ; CLS32: cvta.global.u32 ; ALL-NOT: cvt.u64.u32 ; CLS64: cvta.global.u64 -; ALL: ld.u32 +; ALL: ld.b32 %genptr = addrspacecast ptr addrspace(1) %ptr to ptr %val = load i32, ptr %genptr ret i32 %val @@ -22,7 +22,7 @@ define i32 @conv2(ptr addrspace(3) %ptr) { ; PTRCONV: cvt.u64.u32 ; NOPTRCONV-NOT: cvt.u64.u32 ; 
CLS64: cvta.shared.u64 -; ALL: ld.u32 +; ALL: ld.b32 %genptr = addrspacecast ptr addrspace(3) %ptr to ptr %val = load i32, ptr %genptr ret i32 %val @@ -34,7 +34,7 @@ define i32 @conv3(ptr addrspace(4) %ptr) { ; PTRCONV: cvt.u64.u32 ; NOPTRCONV-NOT: cvt.u64.u32 ; CLS64: cvta.const.u64 -; ALL: ld.u32 +; ALL: ld.b32 %genptr = addrspacecast ptr addrspace(4) %ptr to ptr %val = load i32, ptr %genptr ret i32 %val @@ -46,7 +46,7 @@ define i32 @conv4(ptr addrspace(5) %ptr) { ; PTRCONV: cvt.u64.u32 ; NOPTRCONV-NOT: cvt.u64.u32 ; CLS64: cvta.local.u64 -; ALL: ld.u32 +; ALL: ld.b32 %genptr = addrspacecast ptr addrspace(5) %ptr to ptr %val = load i32, ptr %genptr ret i32 %val @@ -57,7 +57,7 @@ define i32 @conv5(ptr %ptr) { ; CLS32: cvta.to.global.u32 ; ALL-NOT: cvt.u64.u32 ; CLS64: cvta.to.global.u64 -; ALL: ld.global.u32 +; ALL: ld.global.b32 %specptr = addrspacecast ptr %ptr to ptr addrspace(1) %val = load i32, ptr addrspace(1) %specptr ret i32 %val @@ -69,7 +69,7 @@ define i32 @conv6(ptr %ptr) { ; CLS64: cvta.to.shared.u64 ; PTRCONV: cvt.u32.u64 ; NOPTRCONV-NOT: cvt.u32.u64 -; ALL: ld.shared.u32 +; ALL: ld.shared.b32 %specptr = addrspacecast ptr %ptr to ptr addrspace(3) %val = load i32, ptr addrspace(3) %specptr ret i32 %val @@ -81,7 +81,7 @@ define i32 @conv7(ptr %ptr) { ; CLS64: cvta.to.const.u64 ; PTRCONV: cvt.u32.u64 ; NOPTRCONV-NOT: cvt.u32.u64 -; ALL: ld.const.u32 +; ALL: ld.const.b32 %specptr = addrspacecast ptr %ptr to ptr addrspace(4) %val = load i32, ptr addrspace(4) %specptr ret i32 %val @@ -93,7 +93,7 @@ define i32 @conv8(ptr %ptr) { ; CLS64: cvta.to.local.u64 ; PTRCONV: cvt.u32.u64 ; NOPTRCONV-NOT: cvt.u32.u64 -; ALL: ld.local.u32 +; ALL: ld.local.b32 %specptr = addrspacecast ptr %ptr to ptr addrspace(5) %val = load i32, ptr addrspace(5) %specptr ret i32 %val @@ -104,7 +104,7 @@ define i32 @conv9(ptr addrspace(1) %ptr) { ; CLS32: // implicit-def: %[[ADDR:r[0-9]+]] ; PTRCONV: // implicit-def: %[[ADDR:r[0-9]+]] ; NOPTRCONV: // implicit-def: %[[ADDR:rd[0-9]+]] -; 
ALL: ld.shared.u32 %r{{[0-9]+}}, [%[[ADDR]]] +; ALL: ld.shared.b32 %r{{[0-9]+}}, [%[[ADDR]]] %specptr = addrspacecast ptr addrspace(1) %ptr to ptr addrspace(3) %val = load i32, ptr addrspace(3) %specptr ret i32 %val @@ -120,8 +120,8 @@ define void @split1To0(ptr nocapture noundef readonly %xs) { ; CLS32: cvta.global.u32 ; CLS64: cvta.global.u64 ; CLS64: cvta.global.u64 -; ALL: st.u32 -; ALL: st.u32 +; ALL: st.b32 +; ALL: st.b32 %vec_addr = load <2 x ptr addrspace(1)>, ptr %xs, align 16 %addrspacecast = addrspacecast <2 x ptr addrspace(1)> %vec_addr to <2 x ptr> %extractelement0 = extractelement <2 x ptr> %addrspacecast, i64 0 @@ -139,8 +139,8 @@ define void @split0To1(ptr nocapture noundef readonly %xs) { ; CLS32: cvta.to.global.u32 ; CLS64: cvta.to.global.u64 ; CLS64: cvta.to.global.u64 -; ALL: st.global.u32 -; ALL: st.global.u32 +; ALL: st.global.b32 +; ALL: st.global.b32 %vec_addr = load <2 x ptr>, ptr %xs, align 16 %addrspacecast = addrspacecast <2 x ptr> %vec_addr to <2 x ptr addrspace(1)> %extractelement0 = extractelement <2 x ptr addrspace(1)> %addrspacecast, i64 0 @@ -162,9 +162,9 @@ define void @widen1To0(ptr nocapture noundef readonly %xs) { ; CLS64: cvta.global.u64 ; CLS64: cvta.global.u64 -; ALL: st.u32 -; ALL: st.u32 -; ALL: st.u32 +; ALL: st.b32 +; ALL: st.b32 +; ALL: st.b32 %vec_addr = load <3 x ptr addrspace(1)>, ptr %xs, align 16 %addrspacecast = addrspacecast <3 x ptr addrspace(1)> %vec_addr to <3 x ptr> %extractelement0 = extractelement <3 x ptr> %addrspacecast, i64 0 @@ -188,9 +188,9 @@ define void @widen0To1(ptr nocapture noundef readonly %xs) { ; CLS64: cvta.to.global.u64 ; CLS64: cvta.to.global.u64 -; ALL: st.global.u32 -; ALL: st.global.u32 -; ALL: st.global.u32 +; ALL: st.global.b32 +; ALL: st.global.b32 +; ALL: st.global.b32 %vec_addr = load <3 x ptr>, ptr %xs, align 16 %addrspacecast = addrspacecast <3 x ptr> %vec_addr to <3 x ptr addrspace(1)> %extractelement0 = extractelement <3 x ptr addrspace(1)> %addrspacecast, i64 0 diff --git 
a/llvm/test/CodeGen/NVPTX/aggregate-return.ll b/llvm/test/CodeGen/NVPTX/aggregate-return.ll index cda7d38ccb0b..72c302433f08 100644 --- a/llvm/test/CodeGen/NVPTX/aggregate-return.ll +++ b/llvm/test/CodeGen/NVPTX/aggregate-return.ll @@ -10,9 +10,9 @@ define void @test_v2f32(<2 x float> %input, ptr %output) { ; CHECK-LABEL: @test_v2f32 %call = tail call <2 x float> @barv(<2 x float> %input) ; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0]; +; CHECK: ld.param.v2.b32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0]; store <2 x float> %call, ptr %output, align 8 -; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]} +; CHECK: st.v2.b32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]} ret void } @@ -21,15 +21,15 @@ define void @test_v3f32(<3 x float> %input, ptr %output) { ; %call = tail call <3 x float> @barv3(<3 x float> %input) ; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK-DAG: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0]; -; CHECK-DAG: ld.param.f32 [[E2:%f[0-9]+]], [retval0+8]; +; CHECK-DAG: ld.param.v2.b32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0]; +; CHECK-DAG: ld.param.b32 [[E2:%f[0-9]+]], [retval0+8]; ; Make sure we don't load more values than than we need to. -; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12]; +; CHECK-NOT: ld.param.b32 [[E3:%f[0-9]+]], [retval0+12]; store <3 x float> %call, ptr %output, align 8 -; CHECK-DAG: st.f32 [{{%rd[0-9]}}+8], +; CHECK-DAG: st.b32 [{{%rd[0-9]}}+8], ; -- This is suboptimal. We should do st.v2.f32 instead ; of combining 2xf32 info i64. 
-; CHECK-DAG: st.u64 [{{%rd[0-9]}}], +; CHECK-DAG: st.b64 [{{%rd[0-9]}}], ; CHECK: ret; ret void } @@ -38,12 +38,12 @@ define void @test_a2f32([2 x float] %input, ptr %output) { ; CHECK-LABEL: @test_a2f32 %call = tail call [2 x float] @bara([2 x float] %input) ; CHECK: .param .align 4 .b8 retval0[8]; -; CHECK-DAG: ld.param.f32 [[ELEMA1:%f[0-9]+]], [retval0]; -; CHECK-DAG: ld.param.f32 [[ELEMA2:%f[0-9]+]], [retval0+4]; +; CHECK-DAG: ld.param.b32 [[ELEMA1:%f[0-9]+]], [retval0]; +; CHECK-DAG: ld.param.b32 [[ELEMA2:%f[0-9]+]], [retval0+4]; store [2 x float] %call, ptr %output, align 4 ; CHECK: } -; CHECK-DAG: st.f32 [{{%rd[0-9]+}}], [[ELEMA1]] -; CHECK-DAG: st.f32 [{{%rd[0-9]+}}+4], [[ELEMA2]] +; CHECK-DAG: st.b32 [{{%rd[0-9]+}}], [[ELEMA1]] +; CHECK-DAG: st.b32 [{{%rd[0-9]+}}+4], [[ELEMA2]] ret void ; CHECK: ret } @@ -52,12 +52,12 @@ define void @test_s2f32({float, float} %input, ptr %output) { ; CHECK-LABEL: @test_s2f32 %call = tail call {float, float} @bars({float, float} %input) ; CHECK: .param .align 4 .b8 retval0[8]; -; CHECK-DAG: ld.param.f32 [[ELEMS1:%f[0-9]+]], [retval0]; -; CHECK-DAG: ld.param.f32 [[ELEMS2:%f[0-9]+]], [retval0+4]; +; CHECK-DAG: ld.param.b32 [[ELEMS1:%f[0-9]+]], [retval0]; +; CHECK-DAG: ld.param.b32 [[ELEMS2:%f[0-9]+]], [retval0+4]; store {float, float} %call, ptr %output, align 4 ; CHECK: } -; CHECK-DAG: st.f32 [{{%rd[0-9]+}}], [[ELEMS1]] -; CHECK-DAG: st.f32 [{{%rd[0-9]+}}+4], [[ELEMS2]] +; CHECK-DAG: st.b32 [{{%rd[0-9]+}}], [[ELEMS1]] +; CHECK-DAG: st.b32 [{{%rd[0-9]+}}+4], [[ELEMS2]] ret void ; CHECK: ret } diff --git a/llvm/test/CodeGen/NVPTX/and-or-setcc.ll b/llvm/test/CodeGen/NVPTX/and-or-setcc.ll index 5949de335b8c..53c741bd6cb2 100644 --- a/llvm/test/CodeGen/NVPTX/and-or-setcc.ll +++ b/llvm/test/CodeGen/NVPTX/and-or-setcc.ll @@ -12,8 +12,8 @@ define i1 @and_ord(float %a, float %b) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [and_ord_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, 
[and_ord_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [and_ord_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [and_ord_param_1]; ; CHECK-NEXT: setp.num.f32 %p1, %f1, %f2; ; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; @@ -32,8 +32,8 @@ define i1 @or_uno(float %a, float %b) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [or_uno_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [or_uno_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [or_uno_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [or_uno_param_1]; ; CHECK-NEXT: setp.nan.f32 %p1, %f1, %f2; ; CHECK-NEXT: selp.b32 %r1, 1, 0, %p1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; diff --git a/llvm/test/CodeGen/NVPTX/anonymous-fn-param.ll b/llvm/test/CodeGen/NVPTX/anonymous-fn-param.ll index 9d7bd4a3ed6c..713ceb2d7d57 100644 --- a/llvm/test/CodeGen/NVPTX/anonymous-fn-param.ll +++ b/llvm/test/CodeGen/NVPTX/anonymous-fn-param.ll @@ -6,7 +6,7 @@ ; CHECK: .func (.param .b32 func_retval0) __unnamed_1( ; CHECK-NEXT: .param .b32 __unnamed_1_param_0 -; CHECK: ld.param.u32 {{%r[0-9]+}}, [__unnamed_1_param_0]; +; CHECK: ld.param.b32 {{%r[0-9]+}}, [__unnamed_1_param_0]; define internal i32 @0(i32 %a) { entry: @@ -16,7 +16,7 @@ entry: ; CHECK: .func (.param .b32 func_retval0) __unnamed_2( ; CHECK-NEXT: .param .b32 __unnamed_2_param_0 -; CHECK: ld.param.u32 {{%r[0-9]+}}, [__unnamed_2_param_0]; +; CHECK: ld.param.b32 {{%r[0-9]+}}, [__unnamed_2_param_0]; define internal i32 @1(i32 %a) { entry: diff --git a/llvm/test/CodeGen/NVPTX/applypriority.ll b/llvm/test/CodeGen/NVPTX/applypriority.ll index af161d82a25e..23b1bda9a32b 100644 --- a/llvm/test/CodeGen/NVPTX/applypriority.ll +++ b/llvm/test/CodeGen/NVPTX/applypriority.ll @@ -13,7 +13,7 @@ define void @applypriority_global_L2(ptr addrspace(1) %global_ptr, i64 %size) { ; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, 
[applypriority_global_L2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [applypriority_global_L2_param_0]; ; CHECK-PTX64-NEXT: applypriority.global.L2::evict_normal [%rd1], 128; ; CHECK-PTX64-NEXT: ret; tail call void @llvm.nvvm.applypriority.global.L2.evict.normal(ptr addrspace(1) %global_ptr, i64 128) @@ -26,7 +26,7 @@ define void @applypriority_L2(ptr %ptr, i64 %size) { ; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [applypriority_L2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [applypriority_L2_param_0]; ; CHECK-PTX64-NEXT: applypriority.L2::evict_normal [%rd1], 128; ; CHECK-PTX64-NEXT: ret; tail call void @llvm.nvvm.applypriority.L2.evict.normal(ptr %ptr, i64 128) diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll index b14295020bc0..22a7177650ee 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll @@ -15,13 +15,13 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_param_0]; ; CHECK-NEXT: ld.param.b16 %rs1, [test_param_3]; ; CHECK-NEXT: atom.add.noftz.f16 %rs2, [%r1], %rs1; -; CHECK-NEXT: ld.param.u32 %r2, [test_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [test_param_1]; ; CHECK-NEXT: mov.b16 %rs3, 0x3C00; ; CHECK-NEXT: atom.add.noftz.f16 %rs4, [%r1], %rs3; -; CHECK-NEXT: ld.param.u32 %r3, [test_param_2]; +; CHECK-NEXT: ld.param.b32 %r3, [test_param_2]; ; CHECK-NEXT: atom.global.add.noftz.f16 %rs5, [%r2], %rs1; ; CHECK-NEXT: atom.shared.add.noftz.f16 %rs6, [%r3], %rs1; ; CHECK-NEXT: ret; @@ -32,13 +32,13 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECK64-NEXT: .reg .b64 %rd<4>; ; CHECK64-EMPTY: ; CHECK64-NEXT: // %bb.0: -; CHECK64-NEXT: ld.param.u64 %rd1, 
[test_param_0]; +; CHECK64-NEXT: ld.param.b64 %rd1, [test_param_0]; ; CHECK64-NEXT: ld.param.b16 %rs1, [test_param_3]; ; CHECK64-NEXT: atom.add.noftz.f16 %rs2, [%rd1], %rs1; -; CHECK64-NEXT: ld.param.u64 %rd2, [test_param_1]; +; CHECK64-NEXT: ld.param.b64 %rd2, [test_param_1]; ; CHECK64-NEXT: mov.b16 %rs3, 0x3C00; ; CHECK64-NEXT: atom.add.noftz.f16 %rs4, [%rd1], %rs3; -; CHECK64-NEXT: ld.param.u64 %rd3, [test_param_2]; +; CHECK64-NEXT: ld.param.b64 %rd3, [test_param_2]; ; CHECK64-NEXT: atom.global.add.noftz.f16 %rs5, [%rd2], %rs1; ; CHECK64-NEXT: atom.shared.add.noftz.f16 %rs6, [%rd3], %rs1; ; CHECK64-NEXT: ret; @@ -51,16 +51,16 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-EMPTY: ; CHECKPTX62-NEXT: // %bb.0: ; CHECKPTX62-NEXT: ld.param.b16 %rs1, [test_param_3]; -; CHECKPTX62-NEXT: ld.param.u32 %r23, [test_param_2]; -; CHECKPTX62-NEXT: ld.param.u32 %r22, [test_param_1]; -; CHECKPTX62-NEXT: ld.param.u32 %r24, [test_param_0]; +; CHECKPTX62-NEXT: ld.param.b32 %r23, [test_param_2]; +; CHECKPTX62-NEXT: ld.param.b32 %r22, [test_param_1]; +; CHECKPTX62-NEXT: ld.param.b32 %r24, [test_param_0]; ; CHECKPTX62-NEXT: and.b32 %r1, %r24, -4; ; CHECKPTX62-NEXT: and.b32 %r25, %r24, 3; ; CHECKPTX62-NEXT: shl.b32 %r2, %r25, 3; ; CHECKPTX62-NEXT: mov.b32 %r26, 65535; ; CHECKPTX62-NEXT: shl.b32 %r27, %r26, %r2; ; CHECKPTX62-NEXT: not.b32 %r3, %r27; -; CHECKPTX62-NEXT: ld.u32 %r54, [%r1]; +; CHECKPTX62-NEXT: ld.b32 %r54, [%r1]; ; CHECKPTX62-NEXT: $L__BB0_1: // %atomicrmw.start45 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECKPTX62-NEXT: shr.u32 %r28, %r54, %r2; @@ -75,7 +75,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: mov.b32 %r54, %r6; ; CHECKPTX62-NEXT: @%p1 bra $L__BB0_1; ; CHECKPTX62-NEXT: // %bb.2: // %atomicrmw.end44 -; CHECKPTX62-NEXT: ld.u32 %r55, [%r1]; +; CHECKPTX62-NEXT: ld.b32 %r55, [%r1]; ; CHECKPTX62-NEXT: $L__BB0_3: // %atomicrmw.start27 ; 
CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECKPTX62-NEXT: shr.u32 %r33, %r55, %r2; @@ -97,7 +97,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: mov.b32 %r39, 65535; ; CHECKPTX62-NEXT: shl.b32 %r40, %r39, %r11; ; CHECKPTX62-NEXT: not.b32 %r12, %r40; -; CHECKPTX62-NEXT: ld.global.u32 %r56, [%r10]; +; CHECKPTX62-NEXT: ld.global.b32 %r56, [%r10]; ; CHECKPTX62-NEXT: $L__BB0_5: // %atomicrmw.start9 ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECKPTX62-NEXT: shr.u32 %r41, %r56, %r11; @@ -118,7 +118,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half % ; CHECKPTX62-NEXT: mov.b32 %r47, 65535; ; CHECKPTX62-NEXT: shl.b32 %r48, %r47, %r17; ; CHECKPTX62-NEXT: not.b32 %r18, %r48; -; CHECKPTX62-NEXT: ld.shared.u32 %r57, [%r16]; +; CHECKPTX62-NEXT: ld.shared.b32 %r57, [%r16]; ; CHECKPTX62-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX62-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECKPTX62-NEXT: shr.u32 %r49, %r57, %r17; diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll index f27e574724ce..b5a4f9461145 100644 --- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll @@ -15,13 +15,13 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_param_0]; ; CHECK-NEXT: ld.param.b16 %rs1, [test_param_3]; ; CHECK-NEXT: atom.add.noftz.bf16 %rs2, [%r1], %rs1; -; CHECK-NEXT: ld.param.u32 %r2, [test_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [test_param_1]; ; CHECK-NEXT: mov.b16 %rs3, 0x3F80; ; CHECK-NEXT: atom.add.noftz.bf16 %rs4, [%r1], %rs3; -; CHECK-NEXT: ld.param.u32 %r3, [test_param_2]; +; CHECK-NEXT: ld.param.b32 %r3, [test_param_2]; ; CHECK-NEXT: atom.global.add.noftz.bf16 %rs5, [%r2], %rs1; ; CHECK-NEXT: 
atom.shared.add.noftz.bf16 %rs6, [%r3], %rs1; ; CHECK-NEXT: ret; @@ -32,13 +32,13 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECK64-NEXT: .reg .b64 %rd<4>; ; CHECK64-EMPTY: ; CHECK64-NEXT: // %bb.0: -; CHECK64-NEXT: ld.param.u64 %rd1, [test_param_0]; +; CHECK64-NEXT: ld.param.b64 %rd1, [test_param_0]; ; CHECK64-NEXT: ld.param.b16 %rs1, [test_param_3]; ; CHECK64-NEXT: atom.add.noftz.bf16 %rs2, [%rd1], %rs1; -; CHECK64-NEXT: ld.param.u64 %rd2, [test_param_1]; +; CHECK64-NEXT: ld.param.b64 %rd2, [test_param_1]; ; CHECK64-NEXT: mov.b16 %rs3, 0x3F80; ; CHECK64-NEXT: atom.add.noftz.bf16 %rs4, [%rd1], %rs3; -; CHECK64-NEXT: ld.param.u64 %rd3, [test_param_2]; +; CHECK64-NEXT: ld.param.b64 %rd3, [test_param_2]; ; CHECK64-NEXT: atom.global.add.noftz.bf16 %rs5, [%rd2], %rs1; ; CHECK64-NEXT: atom.shared.add.noftz.bf16 %rs6, [%rd3], %rs1; ; CHECK64-NEXT: ret; @@ -51,16 +51,16 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-EMPTY: ; CHECKPTX71-NEXT: // %bb.0: ; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3]; -; CHECKPTX71-NEXT: ld.param.u32 %r23, [test_param_2]; -; CHECKPTX71-NEXT: ld.param.u32 %r22, [test_param_1]; -; CHECKPTX71-NEXT: ld.param.u32 %r24, [test_param_0]; +; CHECKPTX71-NEXT: ld.param.b32 %r23, [test_param_2]; +; CHECKPTX71-NEXT: ld.param.b32 %r22, [test_param_1]; +; CHECKPTX71-NEXT: ld.param.b32 %r24, [test_param_0]; ; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4; ; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3; ; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3; ; CHECKPTX71-NEXT: mov.b32 %r26, 65535; ; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2; ; CHECKPTX71-NEXT: not.b32 %r3, %r27; -; CHECKPTX71-NEXT: ld.u32 %r54, [%r1]; +; CHECKPTX71-NEXT: ld.b32 %r54, [%r1]; ; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2; @@ -76,7 +76,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr 
addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: mov.b32 %r54, %r6; ; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1; ; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44 -; CHECKPTX71-NEXT: ld.u32 %r55, [%r1]; +; CHECKPTX71-NEXT: ld.b32 %r55, [%r1]; ; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2; @@ -98,7 +98,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: mov.b32 %r39, 65535; ; CHECKPTX71-NEXT: shl.b32 %r40, %r39, %r11; ; CHECKPTX71-NEXT: not.b32 %r12, %r40; -; CHECKPTX71-NEXT: ld.global.u32 %r56, [%r10]; +; CHECKPTX71-NEXT: ld.global.b32 %r56, [%r10]; ; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9 ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11; @@ -120,7 +120,7 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat ; CHECKPTX71-NEXT: mov.b32 %r47, 65535; ; CHECKPTX71-NEXT: shl.b32 %r48, %r47, %r17; ; CHECKPTX71-NEXT: not.b32 %r18, %r48; -; CHECKPTX71-NEXT: ld.shared.u32 %r57, [%r16]; +; CHECKPTX71-NEXT: ld.shared.b32 %r57, [%r16]; ; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start ; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17; diff --git a/llvm/test/CodeGen/NVPTX/atomics.ll b/llvm/test/CodeGen/NVPTX/atomics.ll index 16de80d55a05..6c5af3da5d9b 100644 --- a/llvm/test/CodeGen/NVPTX/atomics.ll +++ b/llvm/test/CodeGen/NVPTX/atomics.ll @@ -11,8 +11,8 @@ define i32 @atom0(ptr %addr, i32 %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom0_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [atom0_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom0_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [atom0_param_1]; ; CHECK-NEXT: atom.add.u32 %r2, [%rd1], %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -27,8 +27,8 @@ 
define i64 @atom1(ptr %addr, i64 %val) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom1_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [atom1_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [atom1_param_1]; ; CHECK-NEXT: atom.add.u64 %rd3, [%rd1], %rd2; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-NEXT: ret; @@ -44,8 +44,8 @@ define i32 @atom2(ptr %subr, i32 %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom2_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [atom2_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom2_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [atom2_param_1]; ; CHECK-NEXT: neg.s32 %r2, %r1; ; CHECK-NEXT: atom.add.u32 %r3, [%rd1], %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; @@ -61,8 +61,8 @@ define i64 @atom3(ptr %subr, i64 %val) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom3_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [atom3_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom3_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [atom3_param_1]; ; CHECK-NEXT: neg.s64 %rd3, %rd2; ; CHECK-NEXT: atom.add.u64 %rd4, [%rd1], %rd3; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; @@ -79,8 +79,8 @@ define i32 @atom4(ptr %subr, i32 %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom4_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [atom4_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom4_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [atom4_param_1]; ; CHECK-NEXT: atom.and.b32 %r2, [%rd1], %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -95,8 +95,8 @@ define i64 @atom5(ptr %subr, i64 %val) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom5_param_0]; -; CHECK-NEXT: 
ld.param.u64 %rd2, [atom5_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom5_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [atom5_param_1]; ; CHECK-NEXT: atom.and.b64 %rd3, [%rd1], %rd2; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-NEXT: ret; @@ -123,8 +123,8 @@ define i32 @atom8(ptr %subr, i32 %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom8_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [atom8_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom8_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [atom8_param_1]; ; CHECK-NEXT: atom.or.b32 %r2, [%rd1], %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -139,8 +139,8 @@ define i64 @atom9(ptr %subr, i64 %val) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom9_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [atom9_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom9_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [atom9_param_1]; ; CHECK-NEXT: atom.or.b64 %rd3, [%rd1], %rd2; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-NEXT: ret; @@ -156,8 +156,8 @@ define i32 @atom10(ptr %subr, i32 %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom10_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [atom10_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom10_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [atom10_param_1]; ; CHECK-NEXT: atom.xor.b32 %r2, [%rd1], %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -172,8 +172,8 @@ define i64 @atom11(ptr %subr, i64 %val) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom11_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [atom11_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom11_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [atom11_param_1]; ; CHECK-NEXT: atom.xor.b64 %rd3, [%rd1], %rd2; ; 
CHECK-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-NEXT: ret; @@ -189,8 +189,8 @@ define i32 @atom12(ptr %subr, i32 %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom12_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [atom12_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom12_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [atom12_param_1]; ; CHECK-NEXT: atom.max.s32 %r2, [%rd1], %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -205,8 +205,8 @@ define i64 @atom13(ptr %subr, i64 %val) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom13_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [atom13_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom13_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [atom13_param_1]; ; CHECK-NEXT: atom.max.s64 %rd3, [%rd1], %rd2; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-NEXT: ret; @@ -222,8 +222,8 @@ define i32 @atom14(ptr %subr, i32 %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom14_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [atom14_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom14_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [atom14_param_1]; ; CHECK-NEXT: atom.min.s32 %r2, [%rd1], %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -238,8 +238,8 @@ define i64 @atom15(ptr %subr, i64 %val) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom15_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [atom15_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom15_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [atom15_param_1]; ; CHECK-NEXT: atom.min.s64 %rd3, [%rd1], %rd2; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-NEXT: ret; @@ -255,8 +255,8 @@ define i32 @atom16(ptr %subr, i32 %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; 
CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom16_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [atom16_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom16_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [atom16_param_1]; ; CHECK-NEXT: atom.max.u32 %r2, [%rd1], %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -271,8 +271,8 @@ define i64 @atom17(ptr %subr, i64 %val) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom17_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [atom17_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom17_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [atom17_param_1]; ; CHECK-NEXT: atom.max.u64 %rd3, [%rd1], %rd2; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-NEXT: ret; @@ -288,8 +288,8 @@ define i32 @atom18(ptr %subr, i32 %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom18_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [atom18_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom18_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [atom18_param_1]; ; CHECK-NEXT: atom.min.u32 %r2, [%rd1], %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -304,8 +304,8 @@ define i64 @atom19(ptr %subr, i64 %val) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom19_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [atom19_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom19_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [atom19_param_1]; ; CHECK-NEXT: atom.min.u64 %rd3, [%rd1], %rd2; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-NEXT: ret; @@ -320,8 +320,8 @@ define i32 @atom20(ptr %subr, i32 %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom20_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [atom20_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom20_param_0]; +; 
CHECK-NEXT: ld.param.b32 %r1, [atom20_param_1]; ; CHECK-NEXT: atom.inc.u32 %r2, [%rd1], %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -336,8 +336,8 @@ define i32 @atom21(ptr %subr, i32 %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atom21_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [atom21_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atom21_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [atom21_param_1]; ; CHECK-NEXT: atom.dec.u32 %r2, [%rd1], %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -355,10 +355,10 @@ define float @atomic_add_f32_generic(ptr %addr, float %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atomic_add_f32_generic_param_0]; -; CHECK-NEXT: ld.param.f32 %f1, [atomic_add_f32_generic_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atomic_add_f32_generic_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [atomic_add_f32_generic_param_1]; ; CHECK-NEXT: atom.add.f32 %f2, [%rd1], %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %ret = call float @llvm.nvvm.atomic.load.add.f32.p0(ptr %addr, float %val) ret float %ret @@ -374,10 +374,10 @@ define float @atomic_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atomic_add_f32_addrspace1_param_0]; -; CHECK-NEXT: ld.param.f32 %f1, [atomic_add_f32_addrspace1_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atomic_add_f32_addrspace1_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [atomic_add_f32_addrspace1_param_1]; ; CHECK-NEXT: atom.global.add.f32 %f2, [%rd1], %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %ret = call float @llvm.nvvm.atomic.load.add.f32.p1(ptr addrspace(1) %addr, float %val) 
ret float %ret @@ -393,10 +393,10 @@ define float @atomic_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atomic_add_f32_addrspace3_param_0]; -; CHECK-NEXT: ld.param.f32 %f1, [atomic_add_f32_addrspace3_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atomic_add_f32_addrspace3_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [atomic_add_f32_addrspace3_param_1]; ; CHECK-NEXT: atom.shared.add.f32 %f2, [%rd1], %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %ret = call float @llvm.nvvm.atomic.load.add.f32.p3(ptr addrspace(3) %addr, float %val) ret float %ret @@ -410,10 +410,10 @@ define float @atomicrmw_add_f32_generic(ptr %addr, float %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atomicrmw_add_f32_generic_param_0]; -; CHECK-NEXT: ld.param.f32 %f1, [atomicrmw_add_f32_generic_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atomicrmw_add_f32_generic_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [atomicrmw_add_f32_generic_param_1]; ; CHECK-NEXT: atom.add.f32 %f2, [%rd1], %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr %addr, float %val seq_cst ret float %ret @@ -431,7 +431,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [atomicrmw_add_f16_generic_param_1]; -; CHECK-NEXT: ld.param.u64 %rd2, [atomicrmw_add_f16_generic_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [atomicrmw_add_f16_generic_param_0]; ; CHECK-NEXT: and.b64 %rd1, %rd2, -4; ; CHECK-NEXT: cvt.u32.u64 %r6, %rd2; ; CHECK-NEXT: and.b32 %r7, %r6, 3; @@ -439,7 +439,7 @@ define half @atomicrmw_add_f16_generic(ptr %addr, half %val) { ; CHECK-NEXT: mov.b32 %r8, 65535; ; CHECK-NEXT: shl.b32 %r9, 
%r8, %r1; ; CHECK-NEXT: not.b32 %r2, %r9; -; CHECK-NEXT: ld.u32 %r16, [%rd1]; +; CHECK-NEXT: ld.b32 %r16, [%rd1]; ; CHECK-NEXT: cvt.f32.f16 %f2, %rs1; ; CHECK-NEXT: $L__BB24_1: // %atomicrmw.start ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -474,10 +474,10 @@ define float @atomicrmw_add_f32_addrspace1(ptr addrspace(1) %addr, float %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atomicrmw_add_f32_addrspace1_param_0]; -; CHECK-NEXT: ld.param.f32 %f1, [atomicrmw_add_f32_addrspace1_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atomicrmw_add_f32_addrspace1_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [atomicrmw_add_f32_addrspace1_param_1]; ; CHECK-NEXT: atom.global.add.f32 %f2, [%rd1], %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr addrspace(1) %addr, float %val seq_cst ret float %ret @@ -491,10 +491,10 @@ define float @atomicrmw_add_f32_addrspace3(ptr addrspace(3) %addr, float %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atomicrmw_add_f32_addrspace3_param_0]; -; CHECK-NEXT: ld.param.f32 %f1, [atomicrmw_add_f32_addrspace3_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [atomicrmw_add_f32_addrspace3_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [atomicrmw_add_f32_addrspace3_param_1]; ; CHECK-NEXT: atom.shared.add.f32 %f2, [%rd1], %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %ret = atomicrmw fadd ptr addrspace(3) %addr, float %val seq_cst ret float %ret @@ -508,10 +508,10 @@ define i32 @atomic_cmpxchg_i32(ptr %addr, i32 %cmp, i32 %new) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atomic_cmpxchg_i32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [atomic_cmpxchg_i32_param_0]; ; CHECK-NEXT: membar.sys; -; 
CHECK-NEXT: ld.param.u32 %r1, [atomic_cmpxchg_i32_param_1]; -; CHECK-NEXT: ld.param.u32 %r2, [atomic_cmpxchg_i32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [atomic_cmpxchg_i32_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [atomic_cmpxchg_i32_param_2]; ; CHECK-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -526,10 +526,10 @@ define i64 @atomic_cmpxchg_i64(ptr %addr, i64 %cmp, i64 %new) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [atomic_cmpxchg_i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [atomic_cmpxchg_i64_param_0]; ; CHECK-NEXT: membar.sys; -; CHECK-NEXT: ld.param.u64 %rd2, [atomic_cmpxchg_i64_param_1]; -; CHECK-NEXT: ld.param.u64 %rd3, [atomic_cmpxchg_i64_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [atomic_cmpxchg_i64_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [atomic_cmpxchg_i64_param_2]; ; CHECK-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/barrier.ll b/llvm/test/CodeGen/NVPTX/barrier.ll index a8f1018c5016..05bdc9087f57 100644 --- a/llvm/test/CodeGen/NVPTX/barrier.ll +++ b/llvm/test/CodeGen/NVPTX/barrier.ll @@ -7,8 +7,8 @@ declare void @llvm.nvvm.barrier.sync.cnt(i32, i32) ; CHECK-LABEL: .func{{.*}}barrier_sync define void @barrier_sync(i32 %id, i32 %cnt) { - ; CHECK: ld.param.u32 [[ID:%r[0-9]+]], [barrier_sync_param_0]; - ; CHECK: ld.param.u32 [[CNT:%r[0-9]+]], [barrier_sync_param_1]; + ; CHECK: ld.param.b32 [[ID:%r[0-9]+]], [barrier_sync_param_0]; + ; CHECK: ld.param.b32 [[CNT:%r[0-9]+]], [barrier_sync_param_1]; ; CHECK: barrier.sync [[ID]], [[CNT]]; call void @llvm.nvvm.barrier.sync.cnt(i32 %id, i32 %cnt) diff --git a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll index 6be13c3a6fde..1ed191fcb9ff 100644 --- a/llvm/test/CodeGen/NVPTX/bf16-instructions.ll +++ 
b/llvm/test/CodeGen/NVPTX/bf16-instructions.ll @@ -22,10 +22,10 @@ define bfloat @test_fadd(bfloat %0, bfloat %1) { ; SM70-NEXT: .reg .b32 %f<4>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %r1, [test_fadd_param_1]; +; SM70-NEXT: ld.param.b16 %r1, [test_fadd_param_1]; ; SM70-NEXT: shl.b32 %r2, %r1, 16; ; SM70-NEXT: mov.b32 %f1, %r2; -; SM70-NEXT: ld.param.u16 %r3, [test_fadd_param_0]; +; SM70-NEXT: ld.param.b16 %r3, [test_fadd_param_0]; ; SM70-NEXT: shl.b32 %r4, %r3, 16; ; SM70-NEXT: mov.b32 %f2, %r4; ; SM70-NEXT: add.rn.f32 %f3, %f2, %f1; @@ -90,10 +90,10 @@ define bfloat @test_fsub(bfloat %0, bfloat %1) { ; SM70-NEXT: .reg .b32 %f<4>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %r1, [test_fsub_param_1]; +; SM70-NEXT: ld.param.b16 %r1, [test_fsub_param_1]; ; SM70-NEXT: shl.b32 %r2, %r1, 16; ; SM70-NEXT: mov.b32 %f1, %r2; -; SM70-NEXT: ld.param.u16 %r3, [test_fsub_param_0]; +; SM70-NEXT: ld.param.b16 %r3, [test_fsub_param_0]; ; SM70-NEXT: shl.b32 %r4, %r3, 16; ; SM70-NEXT: mov.b32 %f2, %r4; ; SM70-NEXT: sub.rn.f32 %f3, %f2, %f1; @@ -569,10 +569,10 @@ define float @test_fpext_float(bfloat %a) #0 { ; SM70-NEXT: .reg .b32 %f<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %r1, [test_fpext_float_param_0]; +; SM70-NEXT: ld.param.b16 %r1, [test_fpext_float_param_0]; ; SM70-NEXT: shl.b32 %r2, %r1, 16; ; SM70-NEXT: mov.b32 %f1, %r2; -; SM70-NEXT: st.param.f32 [func_retval0], %f1; +; SM70-NEXT: st.param.b32 [func_retval0], %f1; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_fpext_float( @@ -583,7 +583,7 @@ define float @test_fpext_float(bfloat %a) #0 { ; SM80-NEXT: // %bb.0: ; SM80-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; ; SM80-NEXT: cvt.f32.bf16 %f1, %rs1; -; SM80-NEXT: st.param.f32 [func_retval0], %f1; +; SM80-NEXT: st.param.b32 [func_retval0], %f1; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_fpext_float( @@ -594,7 +594,7 @@ define float @test_fpext_float(bfloat %a) #0 { ; SM80-FTZ-NEXT: // %bb.0: ; 
SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f1, %rs1; -; SM80-FTZ-NEXT: st.param.f32 [func_retval0], %f1; +; SM80-FTZ-NEXT: st.param.b32 [func_retval0], %f1; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_fpext_float( @@ -605,7 +605,7 @@ define float @test_fpext_float(bfloat %a) #0 { ; SM90-NEXT: // %bb.0: ; SM90-NEXT: ld.param.b16 %rs1, [test_fpext_float_param_0]; ; SM90-NEXT: cvt.f32.bf16 %f1, %rs1; -; SM90-NEXT: st.param.f32 [func_retval0], %f1; +; SM90-NEXT: st.param.b32 [func_retval0], %f1; ; SM90-NEXT: ret; %r = fpext bfloat %a to float ret float %r @@ -620,7 +620,7 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM70-NEXT: .reg .b32 %f<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0]; +; SM70-NEXT: ld.param.b32 %f1, [test_fptrunc_float_param_0]; ; SM70-NEXT: mov.b32 %r1, %f1; ; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1; ; SM70-NEXT: add.s32 %r3, %r2, %r1; @@ -638,7 +638,7 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM80-NEXT: .reg .b32 %f<2>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: -; SM80-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0]; +; SM80-NEXT: ld.param.b32 %f1, [test_fptrunc_float_param_0]; ; SM80-NEXT: cvt.rn.bf16.f32 %rs1, %f1; ; SM80-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-NEXT: ret; @@ -649,7 +649,7 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM80-FTZ-NEXT: .reg .b32 %f<2>; ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: -; SM80-FTZ-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0]; +; SM80-FTZ-NEXT: ld.param.b32 %f1, [test_fptrunc_float_param_0]; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %f1; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; ; SM80-FTZ-NEXT: ret; @@ -660,7 +660,7 @@ define bfloat @test_fptrunc_float(float %a) #0 { ; SM90-NEXT: .reg .b32 %f<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.f32 %f1, [test_fptrunc_float_param_0]; +; SM90-NEXT: ld.param.b32 %f1, [test_fptrunc_float_param_0]; 
; SM90-NEXT: cvt.rn.bf16.f32 %rs1, %f1; ; SM90-NEXT: st.param.b16 [func_retval0], %rs1; ; SM90-NEXT: ret; @@ -677,7 +677,7 @@ define bfloat @test_fadd_imm_1(bfloat %a) #0 { ; SM70-NEXT: .reg .b32 %f<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %r1, [test_fadd_imm_1_param_0]; +; SM70-NEXT: ld.param.b16 %r1, [test_fadd_imm_1_param_0]; ; SM70-NEXT: shl.b32 %r2, %r1, 16; ; SM70-NEXT: mov.b32 %f1, %r2; ; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; @@ -738,8 +738,8 @@ define bfloat @test_select_cc_bf16_f64(double %a, double %b, bfloat %c, bfloat % ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [test_select_cc_bf16_f64_param_0]; -; CHECK-NEXT: ld.param.f64 %fd2, [test_select_cc_bf16_f64_param_1]; +; CHECK-NEXT: ld.param.b64 %fd1, [test_select_cc_bf16_f64_param_0]; +; CHECK-NEXT: ld.param.b64 %fd2, [test_select_cc_bf16_f64_param_1]; ; CHECK-NEXT: setp.lt.f64 %p1, %fd1, %fd2; ; CHECK-NEXT: ld.param.b16 %rs1, [test_select_cc_bf16_f64_param_2]; ; CHECK-NEXT: ld.param.b16 %rs2, [test_select_cc_bf16_f64_param_3]; @@ -760,7 +760,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM70-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; SM70-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; SM70-NEXT: mov.b32 {%rs3, %rs4}, %r2; @@ -790,8 +790,8 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM70-NEXT: cvt.u32.u16 %r19, %rs1; ; SM70-NEXT: shl.b32 %r20, %r19, 16; ; SM70-NEXT: mov.b32 %f8, %r20; -; SM70-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5}; -; SM70-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1}; +; SM70-NEXT: st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5}; +; SM70-NEXT: st.param.v4.b32 [func_retval0+16], {%f4, %f3, 
%f2, %f1}; ; SM70-NEXT: ret; ; ; SM80-LABEL: test_extload_bf16x8( @@ -802,7 +802,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-NEXT: .reg .b64 %rd<2>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: -; SM80-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0]; +; SM80-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM80-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; @@ -816,8 +816,8 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-NEXT: cvt.f32.bf16 %f6, %rs3; ; SM80-NEXT: cvt.f32.bf16 %f7, %rs2; ; SM80-NEXT: cvt.f32.bf16 %f8, %rs1; -; SM80-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5}; -; SM80-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1}; +; SM80-NEXT: st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5}; +; SM80-NEXT: st.param.v4.b32 [func_retval0+16], {%f4, %f3, %f2, %f1}; ; SM80-NEXT: ret; ; ; SM80-FTZ-LABEL: test_extload_bf16x8( @@ -828,7 +828,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-FTZ-NEXT: .reg .b64 %rd<2>; ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: -; SM80-FTZ-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0]; +; SM80-FTZ-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM80-FTZ-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; SM80-FTZ-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; SM80-FTZ-NEXT: mov.b32 {%rs3, %rs4}, %r2; @@ -842,8 +842,8 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f6, %rs3; ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f7, %rs2; ; SM80-FTZ-NEXT: cvt.ftz.f32.bf16 %f8, %rs1; -; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5}; -; SM80-FTZ-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1}; +; SM80-FTZ-NEXT: st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5}; +; SM80-FTZ-NEXT: st.param.v4.b32 
[func_retval0+16], {%f4, %f3, %f2, %f1}; ; SM80-FTZ-NEXT: ret; ; ; SM90-LABEL: test_extload_bf16x8( @@ -854,7 +854,7 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [test_extload_bf16x8_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [test_extload_bf16x8_param_0]; ; SM90-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; SM90-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; SM90-NEXT: mov.b32 {%rs3, %rs4}, %r2; @@ -868,8 +868,8 @@ define <8 x float> @test_extload_bf16x8(ptr addrspace(3) noundef %arg) #0 { ; SM90-NEXT: cvt.f32.bf16 %f6, %rs3; ; SM90-NEXT: cvt.f32.bf16 %f7, %rs2; ; SM90-NEXT: cvt.f32.bf16 %f8, %rs1; -; SM90-NEXT: st.param.v4.f32 [func_retval0], {%f8, %f7, %f6, %f5}; -; SM90-NEXT: st.param.v4.f32 [func_retval0+16], {%f4, %f3, %f2, %f1}; +; SM90-NEXT: st.param.v4.b32 [func_retval0], {%f8, %f7, %f6, %f5}; +; SM90-NEXT: st.param.v4.b32 [func_retval0+16], {%f4, %f3, %f2, %f1}; ; SM90-NEXT: ret; %load = load <8 x bfloat>, ptr addrspace(3) %arg, align 16 %res = fpext <8 x bfloat> %load to <8 x float> @@ -884,7 +884,7 @@ define i16 @test_fptosi_i16(bfloat %a) { ; SM70-NEXT: .reg .b32 %f<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %r1, [test_fptosi_i16_param_0]; +; SM70-NEXT: ld.param.b16 %r1, [test_fptosi_i16_param_0]; ; SM70-NEXT: shl.b32 %r2, %r1, 16; ; SM70-NEXT: mov.b32 %f1, %r2; ; SM70-NEXT: cvt.rzi.s16.f32 %rs1, %f1; @@ -943,7 +943,7 @@ define i16 @test_fptoui_i16(bfloat %a) { ; SM70-NEXT: .reg .b32 %f<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %r1, [test_fptoui_i16_param_0]; +; SM70-NEXT: ld.param.b16 %r1, [test_fptoui_i16_param_0]; ; SM70-NEXT: shl.b32 %r2, %r1, 16; ; SM70-NEXT: mov.b32 %f1, %r2; ; SM70-NEXT: cvt.rzi.u16.f32 %rs1, %f1; @@ -1003,7 +1003,7 @@ define bfloat @test_sitofp_i16(i16 %a) { ; SM70-NEXT: .reg .b32 %f<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.u16 %rs1, [test_sitofp_i16_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0]; ; SM70-NEXT: cvt.rn.f32.s16 %f1, %rs1; ; SM70-NEXT: mov.b32 %r1, %f1; ; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1; @@ -1022,7 +1022,7 @@ define bfloat @test_sitofp_i16(i16 %a) { ; SM80-NEXT: .reg .b32 %f<2>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: -; SM80-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0]; +; SM80-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0]; ; SM80-NEXT: cvt.rn.f32.s16 %f1, %rs1; ; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f1; ; SM80-NEXT: st.param.b16 [func_retval0], %rs2; @@ -1034,7 +1034,7 @@ define bfloat @test_sitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: .reg .b32 %f<2>; ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: -; SM80-FTZ-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0]; +; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0]; ; SM80-FTZ-NEXT: cvt.rn.f32.s16 %f1, %rs1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f1; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; @@ -1045,7 +1045,7 @@ define bfloat @test_sitofp_i16(i16 %a) { ; SM90-NEXT: .reg .b16 %rs<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [test_sitofp_i16_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [test_sitofp_i16_param_0]; ; SM90-NEXT: cvt.rn.bf16.s16 %rs2, %rs1; ; SM90-NEXT: st.param.b16 [func_retval0], %rs2; ; SM90-NEXT: ret; @@ -1062,7 +1062,7 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM70-NEXT: .reg .b32 %f<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0]; ; SM70-NEXT: cvt.rn.f32.u16 %f1, %rs1; ; SM70-NEXT: mov.b32 %r1, %f1; ; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1; @@ -1081,7 +1081,7 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM80-NEXT: .reg .b32 %f<2>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: -; SM80-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0]; +; SM80-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0]; ; SM80-NEXT: cvt.rn.f32.u16 %f1, %rs1; ; 
SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f1; ; SM80-NEXT: st.param.b16 [func_retval0], %rs2; @@ -1093,7 +1093,7 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM80-FTZ-NEXT: .reg .b32 %f<2>; ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: -; SM80-FTZ-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0]; +; SM80-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0]; ; SM80-FTZ-NEXT: cvt.rn.f32.u16 %f1, %rs1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f1; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; @@ -1104,7 +1104,7 @@ define bfloat @test_uitofp_i8(i8 %a) { ; SM90-NEXT: .reg .b16 %rs<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [test_uitofp_i8_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [test_uitofp_i8_param_0]; ; SM90-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; ; SM90-NEXT: st.param.b16 [func_retval0], %rs2; ; SM90-NEXT: ret; @@ -1121,7 +1121,7 @@ define bfloat @test_uitofp_i1(i1 %a) { ; SM70-NEXT: .reg .b32 %f<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0]; ; SM70-NEXT: and.b16 %rs2, %rs1, 1; ; SM70-NEXT: setp.ne.b16 %p1, %rs2, 0; ; SM70-NEXT: selp.b32 %r1, 1, 0, %p1; @@ -1145,7 +1145,7 @@ define bfloat @test_uitofp_i1(i1 %a) { ; SM80-NEXT: .reg .b32 %f<2>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: -; SM80-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0]; +; SM80-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0]; ; SM80-NEXT: and.b16 %rs2, %rs1, 1; ; SM80-NEXT: setp.ne.b16 %p1, %rs2, 0; ; SM80-NEXT: selp.b32 %r1, 1, 0, %p1; @@ -1162,7 +1162,7 @@ define bfloat @test_uitofp_i1(i1 %a) { ; SM80-FTZ-NEXT: .reg .b32 %f<2>; ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: -; SM80-FTZ-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0]; +; SM80-FTZ-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0]; ; SM80-FTZ-NEXT: and.b16 %rs2, %rs1, 1; ; SM80-FTZ-NEXT: setp.ne.b16 %p1, %rs2, 0; ; SM80-FTZ-NEXT: selp.b32 %r1, 1, 0, %p1; @@ -1178,7 +1178,7 @@ define bfloat @test_uitofp_i1(i1 
%a) { ; SM90-NEXT: .reg .b32 %r<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [test_uitofp_i1_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [test_uitofp_i1_param_0]; ; SM90-NEXT: and.b16 %rs2, %rs1, 1; ; SM90-NEXT: setp.ne.b16 %p1, %rs2, 0; ; SM90-NEXT: selp.b32 %r1, 1, 0, %p1; @@ -1198,7 +1198,7 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM70-NEXT: .reg .b32 %f<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0]; ; SM70-NEXT: cvt.rn.f32.u16 %f1, %rs1; ; SM70-NEXT: mov.b32 %r1, %f1; ; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1; @@ -1217,7 +1217,7 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM80-NEXT: .reg .b32 %f<2>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: -; SM80-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0]; +; SM80-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0]; ; SM80-NEXT: cvt.rn.f32.u16 %f1, %rs1; ; SM80-NEXT: cvt.rn.bf16.f32 %rs2, %f1; ; SM80-NEXT: st.param.b16 [func_retval0], %rs2; @@ -1229,7 +1229,7 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM80-FTZ-NEXT: .reg .b32 %f<2>; ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: -; SM80-FTZ-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0]; +; SM80-FTZ-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0]; ; SM80-FTZ-NEXT: cvt.rn.f32.u16 %f1, %rs1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs2, %f1; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs2; @@ -1240,7 +1240,7 @@ define bfloat @test_uitofp_i16(i16 %a) { ; SM90-NEXT: .reg .b16 %rs<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [test_uitofp_i16_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [test_uitofp_i16_param_0]; ; SM90-NEXT: cvt.rn.bf16.u16 %rs2, %rs1; ; SM90-NEXT: st.param.b16 [func_retval0], %rs2; ; SM90-NEXT: ret; @@ -1257,7 +1257,7 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM70-NEXT: .reg .b32 %f<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0]; +; 
SM70-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0]; ; SM70-NEXT: cvt.rn.f32.u32 %f1, %r1; ; SM70-NEXT: mov.b32 %r2, %f1; ; SM70-NEXT: bfe.u32 %r3, %r2, 16, 1; @@ -1277,7 +1277,7 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM80-NEXT: .reg .b32 %f<2>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: -; SM80-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0]; +; SM80-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0]; ; SM80-NEXT: cvt.rn.f32.u32 %f1, %r1; ; SM80-NEXT: cvt.rn.bf16.f32 %rs1, %f1; ; SM80-NEXT: st.param.b16 [func_retval0], %rs1; @@ -1290,7 +1290,7 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM80-FTZ-NEXT: .reg .b32 %f<2>; ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: -; SM80-FTZ-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0]; +; SM80-FTZ-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0]; ; SM80-FTZ-NEXT: cvt.rn.f32.u32 %f1, %r1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %f1; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; @@ -1302,7 +1302,7 @@ define bfloat @test_uitofp_i32(i32 %a) { ; SM90-NEXT: .reg .b32 %r<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u32 %r1, [test_uitofp_i32_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [test_uitofp_i32_param_0]; ; SM90-NEXT: cvt.rn.bf16.u32 %rs1, %r1; ; SM90-NEXT: st.param.b16 [func_retval0], %rs1; ; SM90-NEXT: ret; @@ -1320,7 +1320,7 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0]; ; SM70-NEXT: cvt.rn.f32.u64 %f1, %rd1; ; SM70-NEXT: mov.b32 %r1, %f1; ; SM70-NEXT: bfe.u32 %r2, %r1, 16, 1; @@ -1340,7 +1340,7 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM80-NEXT: .reg .b64 %rd<2>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: -; SM80-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0]; +; SM80-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0]; ; SM80-NEXT: cvt.rn.f32.u64 %f1, %rd1; ; SM80-NEXT: cvt.rn.bf16.f32 %rs1, %f1; ; SM80-NEXT: 
st.param.b16 [func_retval0], %rs1; @@ -1353,7 +1353,7 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM80-FTZ-NEXT: .reg .b64 %rd<2>; ; SM80-FTZ-EMPTY: ; SM80-FTZ-NEXT: // %bb.0: -; SM80-FTZ-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0]; +; SM80-FTZ-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0]; ; SM80-FTZ-NEXT: cvt.rn.f32.u64 %f1, %rd1; ; SM80-FTZ-NEXT: cvt.rn.bf16.f32 %rs1, %f1; ; SM80-FTZ-NEXT: st.param.b16 [func_retval0], %rs1; @@ -1365,7 +1365,7 @@ define bfloat @test_uitofp_i64(i64 %a) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [test_uitofp_i64_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [test_uitofp_i64_param_0]; ; SM90-NEXT: cvt.rn.bf16.u64 %rs1, %rd1; ; SM90-NEXT: st.param.b16 [func_retval0], %rs1; ; SM90-NEXT: ret; @@ -1382,7 +1382,7 @@ define bfloat @test_roundeven(bfloat %a) { ; SM70-NEXT: .reg .b32 %f<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %r1, [test_roundeven_param_0]; +; SM70-NEXT: ld.param.b16 %r1, [test_roundeven_param_0]; ; SM70-NEXT: shl.b32 %r2, %r1, 16; ; SM70-NEXT: mov.b32 %f1, %r2; ; SM70-NEXT: cvt.rni.f32.f32 %f2, %f1; @@ -1514,10 +1514,10 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) { ; SM70-NEXT: .reg .b32 %f<4>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %r1, [test_maxnum_param_1]; +; SM70-NEXT: ld.param.b16 %r1, [test_maxnum_param_1]; ; SM70-NEXT: shl.b32 %r2, %r1, 16; ; SM70-NEXT: mov.b32 %f1, %r2; -; SM70-NEXT: ld.param.u16 %r3, [test_maxnum_param_0]; +; SM70-NEXT: ld.param.b16 %r3, [test_maxnum_param_0]; ; SM70-NEXT: shl.b32 %r4, %r3, 16; ; SM70-NEXT: mov.b32 %f2, %r4; ; SM70-NEXT: max.f32 %f3, %f2, %f1; diff --git a/llvm/test/CodeGen/NVPTX/bf16.ll b/llvm/test/CodeGen/NVPTX/bf16.ll index 98fdbbbdd9c7..059736751f61 100644 --- a/llvm/test/CodeGen/NVPTX/bf16.ll +++ b/llvm/test/CodeGen/NVPTX/bf16.ll @@ -26,8 +26,8 @@ define void @test_bitcast_from_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %ou define void 
@test_bitcast_to_bfloat(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; CHECK-LABEL: @test_bitcast_to_bfloat -; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}] -; CHECK: st.global.u16 [{{%rd[0-9]+}}], [[TMP]] +; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}] +; CHECK: st.global.b16 [{{%rd[0-9]+}}], [[TMP]] %val = load i16, ptr addrspace(1) %in %val_fp = bitcast i16 %val to bfloat store bfloat %val_fp, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll index 677f0d795dde..cd73b78eff97 100644 --- a/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/bf16x2-instructions.ll @@ -157,7 +157,7 @@ define <2 x bfloat> @test_fneg(<2 x bfloat> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_fneg_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fneg_param_0]; ; CHECK-NEXT: xor.b32 %r2, %r1, -2147450880; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -172,9 +172,9 @@ define void @test_ldst_v2bf16(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2bf16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v2bf16_param_0]; ; CHECK-NEXT: ld.b32 %r1, [%rd1]; -; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2bf16_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v2bf16_param_1]; ; CHECK-NEXT: st.b32 [%rd2], %r1; ; CHECK-NEXT: ret; %t1 = load <2 x bfloat>, ptr %a @@ -190,11 +190,11 @@ define void @test_ldst_v3bf16(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3bf16_param_0]; -; CHECK-NEXT: ld.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v3bf16_param_0]; +; CHECK-NEXT: ld.b64 %rd2, [%rd1]; ; CHECK-NEXT: mov.b64 {_, %r1}, %rd2; -; CHECK-NEXT: ld.param.u64 %rd3, 
[test_ldst_v3bf16_param_1]; -; CHECK-NEXT: st.u32 [%rd3], %rd2; +; CHECK-NEXT: ld.param.b64 %rd3, [test_ldst_v3bf16_param_1]; +; CHECK-NEXT: st.b32 [%rd3], %rd2; ; CHECK-NEXT: mov.b32 {%rs1, _}, %r1; ; CHECK-NEXT: st.b16 [%rd3+4], %rs1; ; CHECK-NEXT: ret; @@ -241,7 +241,7 @@ define <2 x bfloat> @test_select(<2 x bfloat> %a, <2 x bfloat> %b, i1 zeroext %c ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [test_select_param_2]; +; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2]; ; CHECK-NEXT: and.b16 %rs2, %rs1, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; ; CHECK-NEXT: ld.param.b32 %r1, [test_select_param_1]; @@ -315,7 +315,7 @@ define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b, ; SM80-NEXT: .reg .b32 %f<11>; ; SM80-EMPTY: ; SM80-NEXT: // %bb.0: -; SM80-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0]; +; SM80-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0]; ; SM80-NEXT: ld.param.b32 %r1, [test_select_cc_f32_bf16_param_2]; ; SM80-NEXT: ld.param.b32 %r2, [test_select_cc_f32_bf16_param_3]; ; SM80-NEXT: mov.b32 {%rs1, %rs2}, %r2; @@ -326,10 +326,10 @@ define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b, ; SM80-NEXT: cvt.f32.bf16 %f5, %rs2; ; SM80-NEXT: cvt.f32.bf16 %f6, %rs4; ; SM80-NEXT: setp.neu.f32 %p2, %f6, %f5; -; SM80-NEXT: ld.param.v2.f32 {%f7, %f8}, [test_select_cc_f32_bf16_param_1]; +; SM80-NEXT: ld.param.v2.b32 {%f7, %f8}, [test_select_cc_f32_bf16_param_1]; ; SM80-NEXT: selp.f32 %f9, %f2, %f8, %p2; ; SM80-NEXT: selp.f32 %f10, %f1, %f7, %p1; -; SM80-NEXT: st.param.v2.f32 [func_retval0], {%f10, %f9}; +; SM80-NEXT: st.param.v2.b32 [func_retval0], {%f10, %f9}; ; SM80-NEXT: ret; ; ; SM90-LABEL: test_select_cc_f32_bf16( @@ -339,14 +339,14 @@ define <2 x float> @test_select_cc_f32_bf16(<2 x float> %a, <2 x float> %b, ; SM90-NEXT: .reg .b32 %f<7>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.v2.f32 {%f1, %f2}, 
[test_select_cc_f32_bf16_param_0]; +; SM90-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_bf16_param_0]; ; SM90-NEXT: ld.param.b32 %r1, [test_select_cc_f32_bf16_param_3]; ; SM90-NEXT: ld.param.b32 %r2, [test_select_cc_f32_bf16_param_2]; ; SM90-NEXT: setp.neu.bf16x2 %p1|%p2, %r2, %r1; -; SM90-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_bf16_param_1]; +; SM90-NEXT: ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f32_bf16_param_1]; ; SM90-NEXT: selp.f32 %f5, %f2, %f4, %p2; ; SM90-NEXT: selp.f32 %f6, %f1, %f3, %p1; -; SM90-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5}; +; SM90-NEXT: st.param.v2.b32 [func_retval0], {%f6, %f5}; ; SM90-NEXT: ret; <2 x bfloat> %c, <2 x bfloat> %d) #0 { %cc = fcmp une <2 x bfloat> %c, %d @@ -365,8 +365,8 @@ define <2 x bfloat> @test_select_cc_bf16_f32(<2 x bfloat> %a, <2 x bfloat> %b, ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_bf16_f32_param_0]; ; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_bf16_f32_param_1]; -; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_bf16_f32_param_2]; -; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_bf16_f32_param_3]; +; CHECK-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_select_cc_bf16_f32_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%f3, %f4}, [test_select_cc_bf16_f32_param_3]; ; CHECK-NEXT: setp.neu.f32 %p1, %f1, %f3; ; CHECK-NEXT: setp.neu.f32 %p2, %f2, %f4; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; @@ -389,7 +389,7 @@ define <2 x bfloat> @test_fptrunc_2xfloat(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0]; ; CHECK-NEXT: cvt.rn.bf16x2.f32 %r1, %f2, %f1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -409,7 +409,7 @@ define <2 x float> @test_fpext_2xfloat(<2 x bfloat> %a) #0 { ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f32.bf16 
%f1, %rs2; ; CHECK-NEXT: cvt.f32.bf16 %f2, %rs1; -; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1}; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%f2, %f1}; ; CHECK-NEXT: ret; %r = fpext <2 x bfloat> %a to <2 x float> ret <2 x float> %r @@ -421,7 +421,7 @@ define <2 x i16> @test_bitcast_2xbf16_to_2xi16(<2 x bfloat> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_2xbf16_to_2xi16_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_2xbf16_to_2xi16_param_0]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %r = bitcast <2 x bfloat> %a to <2 x i16> @@ -507,7 +507,7 @@ define <2 x bfloat> @test_fabs(<2 x bfloat> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_fabs_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fabs_param_0]; ; CHECK-NEXT: and.b32 %r2, %r1, 2147450879; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/bfe.ll b/llvm/test/CodeGen/NVPTX/bfe.ll index 0392f7786731..644bf3606e8f 100644 --- a/llvm/test/CodeGen/NVPTX/bfe.ll +++ b/llvm/test/CodeGen/NVPTX/bfe.ll @@ -12,7 +12,7 @@ define i32 @bfe0(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [bfe0_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [bfe0_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 4, 4; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -27,7 +27,7 @@ define i32 @bfe1(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [bfe1_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [bfe1_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 3, 3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -42,7 +42,7 @@ define i32 @bfe2(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, 
[bfe2_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [bfe2_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 5, 3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -57,7 +57,7 @@ define i32 @no_bfe_on_32bit_overflow(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [no_bfe_on_32bit_overflow_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [no_bfe_on_32bit_overflow_param_0]; ; CHECK-NEXT: shr.s32 %r2, %r1, 31; ; CHECK-NEXT: and.b32 %r3, %r2, 15; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; @@ -73,7 +73,7 @@ define i32 @no_bfe_on_32bit_overflow_shr_and_pair(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [no_bfe_on_32bit_overflow_shr_and_pair_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [no_bfe_on_32bit_overflow_shr_and_pair_param_0]; ; CHECK-NEXT: shr.s32 %r2, %r1, 31; ; CHECK-NEXT: and.b32 %r3, %r2, 15; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; @@ -89,7 +89,7 @@ define i64 @no_bfe_on_64bit_overflow(i64 %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [no_bfe_on_64bit_overflow_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [no_bfe_on_64bit_overflow_param_0]; ; CHECK-NEXT: shr.s64 %rd2, %rd1, 63; ; CHECK-NEXT: and.b64 %rd3, %rd2, 7; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; @@ -105,7 +105,7 @@ define i64 @no_bfe_on_64bit_overflow_shr_and_pair(i64 %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [no_bfe_on_64bit_overflow_shr_and_pair_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [no_bfe_on_64bit_overflow_shr_and_pair_param_0]; ; CHECK-NEXT: shr.s64 %rd2, %rd1, 63; ; CHECK-NEXT: and.b64 %rd3, %rd2, 7; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; @@ -121,7 +121,7 @@ define i32 @bfe_ashr_signed_32(i32 %x) { ; CHECK-O3-NEXT: .reg .b32 %r<3>; ; CHECK-O3-EMPTY: ; CHECK-O3-NEXT: // %bb.0: -; CHECK-O3-NEXT: 
ld.param.u16 %r1, [bfe_ashr_signed_32_param_0+2]; +; CHECK-O3-NEXT: ld.param.b16 %r1, [bfe_ashr_signed_32_param_0+2]; ; CHECK-O3-NEXT: bfe.s32 %r2, %r1, 4, 12; ; CHECK-O3-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-O3-NEXT: ret; @@ -131,7 +131,7 @@ define i32 @bfe_ashr_signed_32(i32 %x) { ; CHECK-O0-NEXT: .reg .b32 %r<3>; ; CHECK-O0-EMPTY: ; CHECK-O0-NEXT: // %bb.0: -; CHECK-O0-NEXT: ld.param.u32 %r1, [bfe_ashr_signed_32_param_0]; +; CHECK-O0-NEXT: ld.param.b32 %r1, [bfe_ashr_signed_32_param_0]; ; CHECK-O0-NEXT: bfe.s32 %r2, %r1, 20, 12; ; CHECK-O0-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-O0-NEXT: ret; @@ -146,7 +146,7 @@ define i32 @bfe_ashr_unsigned_32(i32 %x) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [bfe_ashr_unsigned_32_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [bfe_ashr_unsigned_32_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 5, 6; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -161,7 +161,7 @@ define i64 @bfe_ashr_signed_64(i64 %x) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [bfe_ashr_signed_64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [bfe_ashr_signed_64_param_0]; ; CHECK-NEXT: bfe.s64 %rd2, %rd1, 16, 48; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; ; CHECK-NEXT: ret; @@ -176,7 +176,7 @@ define i64 @bfe_ashr_unsigned_64(i64 %x) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [bfe_ashr_unsigned_64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [bfe_ashr_unsigned_64_param_0]; ; CHECK-NEXT: bfe.u64 %rd2, %rd1, 5, 6; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; ; CHECK-NEXT: ret; @@ -192,7 +192,7 @@ define i32 @bfe3(i128 %a) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [bfe3_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [bfe3_param_0]; ; CHECK-NEXT: cvt.u32.u64 
%r1, %rd1; ; CHECK-NEXT: bfe.s32 %r2, %r1, 15, 17; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; @@ -209,7 +209,7 @@ define i64 @bfe4(i128 %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [bfe4_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [bfe4_param_0]; ; CHECK-NEXT: bfe.s64 %rd3, %rd1, 17, 47; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/bmsk.ll b/llvm/test/CodeGen/NVPTX/bmsk.ll index ead4a42bc6c8..d5b278657bd5 100644 --- a/llvm/test/CodeGen/NVPTX/bmsk.ll +++ b/llvm/test/CodeGen/NVPTX/bmsk.ll @@ -10,8 +10,8 @@ define i32 @bmsk_wrap(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [bmsk_wrap_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [bmsk_wrap_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [bmsk_wrap_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [bmsk_wrap_param_1]; ; CHECK-NEXT: bmsk.wrap.b32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -25,8 +25,8 @@ define i32 @bmsk_clamp(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [bmsk_clamp_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [bmsk_clamp_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [bmsk_clamp_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [bmsk_clamp_param_1]; ; CHECK-NEXT: bmsk.clamp.b32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -54,7 +54,7 @@ define i32 @bmsk_clamp_ir(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [bmsk_clamp_ir_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [bmsk_clamp_ir_param_0]; ; CHECK-NEXT: bmsk.clamp.b32 %r2, %r1, 7; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -68,7 +68,7 @@ define i32 @bmsk_wrap_ri(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; 
CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [bmsk_wrap_ri_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [bmsk_wrap_ri_param_0]; ; CHECK-NEXT: bmsk.wrap.b32 %r2, 5, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/bswap.ll b/llvm/test/CodeGen/NVPTX/bswap.ll index 0054225e6d6e..0e16682641ed 100644 --- a/llvm/test/CodeGen/NVPTX/bswap.ll +++ b/llvm/test/CodeGen/NVPTX/bswap.ll @@ -14,7 +14,7 @@ define i16 @bswap16(i16 %a) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [bswap16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs1, [bswap16_param_0]; ; CHECK-NEXT: shr.u16 %rs2, %rs1, 8; ; CHECK-NEXT: shl.b16 %rs3, %rs1, 8; ; CHECK-NEXT: or.b16 %rs4, %rs3, %rs2; @@ -32,7 +32,7 @@ define i32 @bswap32(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [bswap32_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [bswap32_param_0]; ; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 291; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -47,7 +47,7 @@ define <2 x i16> @bswapv2i16(<2 x i16> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [bswapv2i16_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [bswapv2i16_param_0]; ; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 8961; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -62,7 +62,7 @@ define i64 @bswap64(i64 %a) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [bswap64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [bswap64_param_0]; ; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {%r1, tmp}, %rd1; } ; PTX70-NEXT: prmt.b32 %r2, %r1, 0, 291; ; PTX70-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r3}, %rd1; } diff --git a/llvm/test/CodeGen/NVPTX/bug21465.ll b/llvm/test/CodeGen/NVPTX/bug21465.ll index 33c6dbddd529..79b0dbcf6494 100644 --- 
a/llvm/test/CodeGen/NVPTX/bug21465.ll +++ b/llvm/test/CodeGen/NVPTX/bug21465.ll @@ -15,10 +15,10 @@ entry: ; CHECK: call ptr addrspace(101) @llvm.nvvm.internal.addrspace.wrap.p101.p0(ptr %input) %b = getelementptr inbounds %struct.S, ptr %input, i64 0, i32 1 %0 = load i32, ptr %b, align 4 -; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}] -; PTX: ld.param.u32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4] +; PTX-NOT: ld.param.b32 {{%r[0-9]+}}, [{{%rd[0-9]+}}] +; PTX: ld.param.b32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4] store i32 %0, ptr %output, align 4 -; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]] +; PTX-NEXT: st.global.b32 [{{%rd[0-9]+}}], [[value]] ret void } diff --git a/llvm/test/CodeGen/NVPTX/bug22246.ll b/llvm/test/CodeGen/NVPTX/bug22246.ll index 0080aafcf563..198878c1b96f 100644 --- a/llvm/test/CodeGen/NVPTX/bug22246.ll +++ b/llvm/test/CodeGen/NVPTX/bug22246.ll @@ -13,15 +13,15 @@ define void @_Z3foobbbPb(i1 zeroext %p1, i1 zeroext %p2, i1 zeroext %p3, ptr noc ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u8 %rs1, [_Z3foobbbPb_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [_Z3foobbbPb_param_0]; ; CHECK-NEXT: and.b16 %rs2, %rs1, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-NEXT: ld.param.u8 %rs3, [_Z3foobbbPb_param_1]; -; CHECK-NEXT: ld.param.u8 %rs4, [_Z3foobbbPb_param_2]; +; CHECK-NEXT: ld.param.b8 %rs3, [_Z3foobbbPb_param_1]; +; CHECK-NEXT: ld.param.b8 %rs4, [_Z3foobbbPb_param_2]; ; CHECK-NEXT: selp.b16 %rs5, %rs3, %rs4, %p1; ; CHECK-NEXT: and.b16 %rs6, %rs5, 1; -; CHECK-NEXT: ld.param.u64 %rd1, [_Z3foobbbPb_param_3]; -; CHECK-NEXT: st.u8 [%rd1], %rs6; +; CHECK-NEXT: ld.param.b64 %rd1, [_Z3foobbbPb_param_3]; +; CHECK-NEXT: st.b8 [%rd1], %rs6; ; CHECK-NEXT: ret; entry: %.sink.v = select i1 %p1, i1 %p2, i1 %p3 diff --git a/llvm/test/CodeGen/NVPTX/bug26185-2.ll b/llvm/test/CodeGen/NVPTX/bug26185-2.ll index c0bbf5b3559b..c4d1537557ca 100644 --- 
a/llvm/test/CodeGen/NVPTX/bug26185-2.ll +++ b/llvm/test/CodeGen/NVPTX/bug26185-2.ll @@ -15,7 +15,7 @@ define ptx_kernel void @spam(ptr addrspace(1) noalias nocapture readonly %arg, p bb: %tmp5 = add nsw i64 %arg3, 8 %tmp6 = getelementptr i16, ptr addrspace(1) %arg, i64 %tmp5 -; CHECK: ld.global.nc.u16 +; CHECK: ld.global.nc.b16 %tmp7 = load i16, ptr addrspace(1) %tmp6, align 2 ; CHECK: cvt.s32.s16 %tmp8 = sext i16 %tmp7 to i64 diff --git a/llvm/test/CodeGen/NVPTX/bug26185.ll b/llvm/test/CodeGen/NVPTX/bug26185.ll index 193df7f86ca7..3b30ce560edb 100644 --- a/llvm/test/CodeGen/NVPTX/bug26185.ll +++ b/llvm/test/CodeGen/NVPTX/bug26185.ll @@ -10,7 +10,7 @@ target triple = "nvptx64-unknown-unknown" ; CHECK-LABEL: ex_zext define ptx_kernel void @ex_zext(ptr noalias readonly %data, ptr %res) { entry: -; CHECK: ld.global.nc.u8 +; CHECK: ld.global.nc.b8 %val = load i8, ptr %data ; CHECK: cvt.u32.u8 %valext = zext i8 %val to i32 @@ -21,7 +21,7 @@ entry: ; CHECK-LABEL: ex_sext define ptx_kernel void @ex_sext(ptr noalias readonly %data, ptr %res) { entry: -; CHECK: ld.global.nc.u8 +; CHECK: ld.global.nc.b8 %val = load i8, ptr %data ; CHECK: cvt.s32.s8 %valext = sext i8 %val to i32 @@ -32,7 +32,7 @@ entry: ; CHECK-LABEL: ex_zext_v2 define ptx_kernel void @ex_zext_v2(ptr noalias readonly %data, ptr %res) { entry: -; CHECK: ld.global.nc.v2.u8 +; CHECK: ld.global.nc.v2.b8 %val = load <2 x i8>, ptr %data ; CHECK: cvt.u32.u16 %valext = zext <2 x i8> %val to <2 x i32> @@ -43,7 +43,7 @@ entry: ; CHECK-LABEL: ex_sext_v2 define ptx_kernel void @ex_sext_v2(ptr noalias readonly %data, ptr %res) { entry: -; CHECK: ld.global.nc.v2.u8 +; CHECK: ld.global.nc.v2.b8 %val = load <2 x i8>, ptr %data ; CHECK: cvt.s32.s8 %valext = sext <2 x i8> %val to <2 x i32> diff --git a/llvm/test/CodeGen/NVPTX/byval-const-global.ll b/llvm/test/CodeGen/NVPTX/byval-const-global.ll index 11964e1981e1..2af1e6d7e185 100644 --- a/llvm/test/CodeGen/NVPTX/byval-const-global.ll +++ 
b/llvm/test/CodeGen/NVPTX/byval-const-global.ll @@ -13,8 +13,8 @@ define void @foo() { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.global.u64 %rd1, [G]; -; CHECK-NEXT: ld.global.u64 %rd2, [G+8]; +; CHECK-NEXT: ld.global.b64 %rd1, [G]; +; CHECK-NEXT: ld.global.b64 %rd2, [G+8]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[16]; ; CHECK-NEXT: st.param.b64 [param0], %rd1; diff --git a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll index 9474b01f95ee..c4a62f9f8c50 100644 --- a/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll +++ b/llvm/test/CodeGen/NVPTX/call-with-alloca-buffer.ll @@ -23,11 +23,11 @@ entry: ; CHECK: .local .align 4 .b8 __local_depot0[16] ; CHECK: mov.b64 %SPL -; CHECK: ld.param.u64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0] +; CHECK: ld.param.b64 %rd[[A_REG:[0-9]+]], [kernel_func_param_0] ; CHECK: cvta.to.global.u64 %rd[[A1_REG:[0-9]+]], %rd[[A_REG]] ; CHECK: add.u64 %rd[[SP_REG:[0-9]+]], %SP, 0 -; CHECK: ld.global.f32 %f[[A0_REG:[0-9]+]], [%rd[[A1_REG]]] -; CHECK: st.local.f32 [{{%rd[0-9]+}}], %f[[A0_REG]] +; CHECK: ld.global.b32 %f[[A0_REG:[0-9]+]], [%rd[[A1_REG]]] +; CHECK: st.local.b32 [{{%rd[0-9]+}}], %f[[A0_REG]] %0 = load float, ptr %a, align 4 store float %0, ptr %buf, align 4 diff --git a/llvm/test/CodeGen/NVPTX/chain-different-as.ll b/llvm/test/CodeGen/NVPTX/chain-different-as.ll index 704ed234f7fe..f2d0d9d069ea 100644 --- a/llvm/test/CodeGen/NVPTX/chain-different-as.ll +++ b/llvm/test/CodeGen/NVPTX/chain-different-as.ll @@ -9,8 +9,8 @@ define i64 @test() nounwind readnone { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b64 %rd1, 1; ; CHECK-NEXT: mov.b64 %rd2, 42; -; CHECK-NEXT: st.u64 [%rd1], %rd2; -; CHECK-NEXT: ld.global.u64 %rd3, [%rd1]; +; CHECK-NEXT: st.b64 [%rd1], %rd2; +; CHECK-NEXT: ld.global.b64 %rd3, [%rd1]; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-NEXT: ret; %addr0 = inttoptr i64 1 to ptr 
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll index 442da4debea8..65a077d67e4b 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm60.ll @@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -23,9 +23,9 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -68,9 +68,9 @@ define i8 
@monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -113,9 +113,9 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, 
[monotonic_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -158,9 +158,9 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -192,8 +192,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -204,9 +204,9 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; 
SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -238,8 +238,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -250,9 +250,9 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -284,8 +284,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -297,9 +297,9 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; 
SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -331,8 +331,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -344,9 +344,9 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -378,8 +378,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, 
[monotonic_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -391,9 +391,9 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -425,8 +425,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -437,9 +437,9 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -471,8 +471,8 @@ define 
i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -483,9 +483,9 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -517,8 +517,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -529,9 +529,9 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1]; 
+; SM60-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -563,8 +563,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -575,9 +575,9 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -609,8 +609,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: 
and.b32 %r10, %r9, 3; @@ -621,9 +621,9 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -655,8 +655,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; ; SM60-NEXT: and.b32 %r10, %r9, 3; @@ -667,9 +667,9 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -701,8 +701,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, 
[acquire_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -714,9 +714,9 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -748,8 +748,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -761,9 +761,9 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; 
SM60-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -795,8 +795,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -808,9 +808,9 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -842,8 +842,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -855,9 +855,9 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, 
%r1; -; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -888,8 +888,8 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -901,9 +901,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -934,8 +934,8 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[release_monotonic_i8_shared_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -947,9 +947,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -980,8 +980,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -993,9 +993,9 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1027,8 +1027,8 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg 
.b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1040,9 +1040,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1074,8 +1074,8 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1087,9 +1087,9 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: 
ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1121,8 +1121,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1134,9 +1134,9 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1168,8 +1168,8 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1181,9 +1181,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: 
cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1215,8 +1215,8 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1228,9 +1228,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1262,8 +1262,8 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, 
[acq_rel_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1275,9 +1275,9 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1309,8 +1309,8 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1322,9 +1322,9 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1356,8 +1356,8 @@ 
define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1369,9 +1369,9 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1403,8 +1403,8 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1416,9 +1416,9 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, 
[acq_rel_acquire_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1450,8 +1450,8 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1463,9 +1463,9 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB31_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1497,8 +1497,8 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ 
-1510,9 +1510,9 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1544,8 +1544,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1557,9 +1557,9 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1591,8 +1591,8 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM60-NEXT: 
ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1604,9 +1604,9 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1638,8 +1638,8 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1651,9 +1651,9 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB35_1: // 
%partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1685,8 +1685,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1698,9 +1698,9 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB36_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1732,8 +1732,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1745,9 +1745,9 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; 
SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1779,8 +1779,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1792,9 +1792,9 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1826,8 +1826,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[seq_cst_acquire_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1839,9 +1839,9 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 %r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1873,8 +1873,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1886,9 +1886,9 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB40_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1920,8 +1920,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; 
SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1933,9 +1933,9 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1967,8 +1967,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1980,9 +1980,9 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.u32 
%r16, [%rd1]; +; SM60-NEXT: ld.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2014,8 +2014,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2027,9 +2027,9 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.global.u32 %r16, [%rd1]; +; SM60-NEXT: ld.global.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2061,8 +2061,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM60-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; ; SM60-NEXT: membar.sys; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2074,9 +2074,9 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; 
SM60-NEXT: cvt.u32.u16 %r13, %rs1; ; SM60-NEXT: and.b32 %r14, %r13, 255; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; -; SM60-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM60-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; ; SM60-NEXT: shl.b32 %r4, %r15, %r1; -; SM60-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM60-NEXT: and.b32 %r20, %r16, %r2; ; SM60-NEXT: $L__BB44_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2108,10 +2108,10 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -2121,7 +2121,7 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2152,10 +2152,10 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, 
[monotonic_monotonic_i16_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -2165,7 +2165,7 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2196,10 +2196,10 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -2209,7 +2209,7 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: 
and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2240,10 +2240,10 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -2253,7 +2253,7 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2285,10 +2285,10 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 
%r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -2298,7 +2298,7 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2330,10 +2330,10 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -2343,7 +2343,7 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2375,10 +2375,10 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; 
SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -2389,7 +2389,7 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2421,10 +2421,10 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -2435,7 +2435,7 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This 
Inner Loop Header: Depth=1 @@ -2467,10 +2467,10 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -2481,7 +2481,7 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2513,10 +2513,10 @@ define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -2526,7 +2526,7 @@ define i16 
@acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2558,10 +2558,10 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -2571,7 +2571,7 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2603,10 +2603,10 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[acquire_monotonic_i16_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -2616,7 +2616,7 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2648,10 +2648,10 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -2661,7 +2661,7 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2693,10 +2693,10 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) 
%addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -2706,7 +2706,7 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2738,10 +2738,10 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; -; SM60-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; ; SM60-NEXT: shl.b32 %r1, %r11, 3; @@ -2751,7 +2751,7 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: 
shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2783,10 +2783,10 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -2797,7 +2797,7 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2829,10 +2829,10 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM60-NEXT: 
ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -2843,7 +2843,7 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2875,10 +2875,10 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -2889,7 +2889,7 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2921,10 +2921,10 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, 
[release_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -2935,7 +2935,7 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2966,10 +2966,10 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -2980,7 +2980,7 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, 
[%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3011,10 +3011,10 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3025,7 +3025,7 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3056,10 +3056,10 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: 
cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3070,7 +3070,7 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3102,10 +3102,10 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3116,7 +3116,7 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3148,10 +3148,10 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, 
[release_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3162,7 +3162,7 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3194,10 +3194,10 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3208,7 +3208,7 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3240,10 +3240,10 @@ define i16 
@release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3254,7 +3254,7 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3286,10 +3286,10 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3300,7 +3300,7 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, 
%rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3332,10 +3332,10 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3346,7 +3346,7 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3378,10 +3378,10 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, 
[acq_rel_monotonic_i16_global_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3392,7 +3392,7 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3424,10 +3424,10 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3438,7 +3438,7 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3470,10 +3470,10 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // 
%bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3484,7 +3484,7 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3516,10 +3516,10 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3530,7 +3530,7 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: 
ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3562,10 +3562,10 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3576,7 +3576,7 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3608,10 +3608,10 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; 
SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3622,7 +3622,7 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3654,10 +3654,10 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3668,7 +3668,7 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3700,10 +3700,10 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM60-NEXT: ld.param.b16 
%rs1, [acq_rel_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3714,7 +3714,7 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3746,10 +3746,10 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3760,7 +3760,7 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3792,10 
+3792,10 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3806,7 +3806,7 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3838,10 +3838,10 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3852,7 +3852,7 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, 
i16 %cmp, i16 % ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3884,10 +3884,10 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3898,7 +3898,7 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3930,10 +3930,10 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: 
ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3944,7 +3944,7 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3976,10 +3976,10 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -3990,7 +3990,7 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4022,10 +4022,10 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // 
%bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -4036,7 +4036,7 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.u32 %r15, [%rd1]; +; SM60-NEXT: ld.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4068,10 +4068,10 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -4082,7 +4082,7 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.global.u32 %r15, [%rd1]; +; SM60-NEXT: 
ld.global.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4114,10 +4114,10 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: .reg .b64 %rd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM60-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM60-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; ; SM60-NEXT: and.b64 %rd1, %rd2, -4; ; SM60-NEXT: cvt.u32.u64 %r10, %rd2; ; SM60-NEXT: and.b32 %r11, %r10, 3; @@ -4128,7 +4128,7 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM60-NEXT: cvt.u32.u16 %r14, %rs1; ; SM60-NEXT: shl.b32 %r3, %r14, %r1; ; SM60-NEXT: shl.b32 %r4, %r9, %r1; -; SM60-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM60-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM60-NEXT: and.b32 %r19, %r15, %r2; ; SM60-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM60-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4158,9 +4158,9 @@ define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, 
%r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4175,9 +4175,9 @@ define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4192,9 +4192,9 @@ define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4209,9 +4209,9 @@ define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, 
[monotonic_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4226,9 +4226,9 @@ define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4243,9 +4243,9 @@ define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4260,10 +4260,10 @@ define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; 
SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4278,10 +4278,10 @@ define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4296,10 +4296,10 @@ define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; 
SM60-NEXT: ret; @@ -4314,9 +4314,9 @@ define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4331,9 +4331,9 @@ define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4348,9 +4348,9 @@ define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; +; 
SM60-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4365,9 +4365,9 @@ define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4382,9 +4382,9 @@ define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4399,9 +4399,9 @@ define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2]; +; 
SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4416,10 +4416,10 @@ define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4434,10 +4434,10 @@ define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4452,10 +4452,10 @@ define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: 
ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4470,9 +4470,9 @@ define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4487,9 +4487,9 @@ define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; 
@@ -4504,9 +4504,9 @@ define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4521,9 +4521,9 @@ define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4538,9 +4538,9 @@ define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, 
[release_acquire_i32_global_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4555,9 +4555,9 @@ define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4572,10 +4572,10 @@ define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4590,10 +4590,10 @@ define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, 
[release_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4608,10 +4608,10 @@ define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4626,9 +4626,9 @@ define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4643,9 +4643,9 @@ define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: 
-; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4660,9 +4660,9 @@ define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4677,9 +4677,9 @@ define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4694,9 
+4694,9 @@ define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4711,9 +4711,9 @@ define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4728,10 +4728,10 @@ define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, 
[acq_rel_seq_cst_i32_generic_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4746,10 +4746,10 @@ define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4764,10 +4764,10 @@ define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4782,10 +4782,10 @@ define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: 
ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4800,10 +4800,10 @@ define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4818,10 +4818,10 @@ define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4836,10 +4836,10 @@ define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; 
SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4854,10 +4854,10 @@ define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4872,10 +4872,10 @@ define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; ; 
SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4890,10 +4890,10 @@ define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; ; SM60-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4908,10 +4908,10 @@ define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; ; SM60-NEXT: atom.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4926,10 +4926,10 @@ define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u32 %r1, 
[seq_cst_seq_cst_i32_shared_param_1]; -; SM60-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM60-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM60-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM60-NEXT: st.param.b32 [func_retval0], %r2; ; SM60-NEXT: ret; @@ -4943,9 +4943,9 @@ define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -4959,9 +4959,9 @@ define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -4975,9 +4975,9 @@ define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM60-NEXT: .reg .b64 %rd<5>; ; 
SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -4991,9 +4991,9 @@ define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5007,9 +5007,9 @@ define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, 
%rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5023,9 +5023,9 @@ define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5039,10 +5039,10 @@ define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5056,10 +5056,10 @@ define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM60-NEXT: 
ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5073,10 +5073,10 @@ define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5090,9 +5090,9 @@ define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5106,9 +5106,9 @@ define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; 
SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5122,9 +5122,9 @@ define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5138,9 +5138,9 @@ define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; 
SM60-NEXT: ret; @@ -5154,9 +5154,9 @@ define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5170,9 +5170,9 @@ define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5186,10 +5186,10 @@ define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5203,10 +5203,10 @@ define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5220,10 +5220,10 @@ define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5237,9 +5237,9 @@ define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, 
[release_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5253,9 +5253,9 @@ define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5269,9 +5269,9 @@ define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5285,9 +5285,9 @@ define i64 @release_acquire_i64_generic(ptr %addr, i64 
%cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5301,9 +5301,9 @@ define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5317,9 +5317,9 @@ define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, 
[%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5333,10 +5333,10 @@ define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5350,10 +5350,10 @@ define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5367,10 +5367,10 @@ define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM60-NEXT: 
ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5384,9 +5384,9 @@ define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5400,9 +5400,9 @@ define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5416,9 +5416,9 @@ define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, 
[acq_rel_monotonic_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5432,9 +5432,9 @@ define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5448,9 +5448,9 @@ define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5464,9 +5464,9 @@ define i64 
@acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5480,10 +5480,10 @@ define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5497,10 +5497,10 @@ define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; +; SM60-NEXT: 
ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5514,10 +5514,10 @@ define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5531,10 +5531,10 @@ define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5548,10 +5548,10 @@ define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, 
[seq_cst_monotonic_i64_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5565,10 +5565,10 @@ define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5582,10 +5582,10 @@ define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5599,10 
+5599,10 @@ define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5616,10 +5616,10 @@ define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5633,10 +5633,10 @@ define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, 
[seq_cst_seq_cst_i64_generic_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; ; SM60-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5650,10 +5650,10 @@ define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; ; SM60-NEXT: atom.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; @@ -5667,10 +5667,10 @@ define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM60-NEXT: .reg .b64 %rd<5>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; +; SM60-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; ; SM60-NEXT: membar.sys; -; SM60-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM60-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM60-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM60-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; ; SM60-NEXT: atom.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM60-NEXT: st.param.b64 [func_retval0], %rd3; ; SM60-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll index df8c49aaaa42..7107fbcf6eb5 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm70.ll @@ -11,8 +11,8 @@ define i8 
@monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -23,9 +23,9 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -68,9 +68,9 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM70-NEXT: 
ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -113,9 +113,9 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; 
SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -158,9 +158,9 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -192,8 +192,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -204,9 +204,9 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -238,8 +238,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // 
%bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -250,9 +250,9 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -284,8 +284,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -297,9 +297,9 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: 
ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -331,8 +331,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -344,9 +344,9 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -378,8 +378,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -391,9 +391,9 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: 
cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -425,8 +425,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -437,9 +437,9 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -471,8 +471,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, 
[acquire_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -483,9 +483,9 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -517,8 +517,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -529,9 +529,9 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop 
Header: Depth=1 @@ -563,8 +563,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -575,9 +575,9 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -609,8 +609,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -621,9 +621,9 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1]; +; 
SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -655,8 +655,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -667,9 +667,9 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -701,8 +701,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; 
SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -714,9 +714,9 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -748,8 +748,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -761,9 +761,9 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -795,8 +795,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, 
[acquire_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -808,9 +808,9 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -842,8 +842,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -855,9 +855,9 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, 
%r2; ; SM70-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -888,8 +888,8 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -901,9 +901,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -934,8 +934,8 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -947,9 +947,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 
%r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -980,8 +980,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -993,9 +993,9 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1027,8 +1027,8 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[release_acquire_i8_global_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1040,9 +1040,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1074,8 +1074,8 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1087,9 +1087,9 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1121,8 +1121,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 
%cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1134,9 +1134,9 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1168,8 +1168,8 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1181,9 +1181,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; 
SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1215,8 +1215,8 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1228,9 +1228,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1262,8 +1262,8 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1275,9 +1275,9 @@ define i8 
@acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1309,8 +1309,8 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1322,9 +1322,9 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1356,8 +1356,8 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM70-NEXT: 
ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1369,9 +1369,9 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1403,8 +1403,8 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1416,9 +1416,9 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB30_1: // 
%partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1450,8 +1450,8 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1463,9 +1463,9 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB31_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1497,8 +1497,8 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1510,9 +1510,9 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, 
%r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1544,8 +1544,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1557,9 +1557,9 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1591,8 +1591,8 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; ; 
SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1604,9 +1604,9 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1638,8 +1638,8 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1651,9 +1651,9 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB35_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1685,8 +1685,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; 
SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1698,9 +1698,9 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB36_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1732,8 +1732,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1745,9 +1745,9 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 
%r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1779,8 +1779,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1792,9 +1792,9 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1826,8 +1826,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1839,9 +1839,9 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) 
{ ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1873,8 +1873,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1886,9 +1886,9 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB40_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1920,8 +1920,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 
%rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1933,9 +1933,9 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1967,8 +1967,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1980,9 +1980,9 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2014,8 +2014,8 @@ define i8 
@seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2027,9 +2027,9 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.global.u32 %r16, [%rd1]; +; SM70-NEXT: ld.global.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2061,8 +2061,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2074,9 +2074,9 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM70-NEXT: ld.param.b8 %r15, 
[seq_cst_seq_cst_i8_shared_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB44_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2108,10 +2108,10 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -2121,7 +2121,7 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2152,10 +2152,10 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; 
SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -2165,7 +2165,7 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2196,10 +2196,10 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -2209,7 +2209,7 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2240,10 +2240,10 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; 
SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -2253,7 +2253,7 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2285,10 +2285,10 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -2298,7 +2298,7 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; 
; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2330,10 +2330,10 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -2343,7 +2343,7 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2375,10 +2375,10 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, 
[monotonic_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -2389,7 +2389,7 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2421,10 +2421,10 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -2435,7 +2435,7 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2467,10 +2467,10 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; 
SM70-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -2481,7 +2481,7 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2513,10 +2513,10 @@ define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -2526,7 +2526,7 @@ define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, 
[%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2558,10 +2558,10 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -2571,7 +2571,7 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2603,10 +2603,10 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.b16 %r9, 
[acquire_monotonic_i16_shared_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -2616,7 +2616,7 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2648,10 +2648,10 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -2661,7 +2661,7 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2693,10 +2693,10 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM70-NEXT: 
ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -2706,7 +2706,7 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2738,10 +2738,10 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_shared_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -2751,7 +2751,7 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; 
SM70-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2783,10 +2783,10 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -2797,7 +2797,7 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2829,10 +2829,10 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -2843,7 +2843,7 
@@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2875,10 +2875,10 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -2889,7 +2889,7 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2921,10 +2921,10 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM70-NEXT: 
ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -2935,7 +2935,7 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2966,10 +2966,10 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_global_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -2980,7 +2980,7 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3011,10 +3011,10 @@ define i16 
@release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3025,7 +3025,7 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3056,10 +3056,10 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3070,7 +3070,7 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: 
cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3102,10 +3102,10 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3116,7 +3116,7 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3148,10 +3148,10 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 
%r9, [release_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3162,7 +3162,7 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3194,10 +3194,10 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3208,7 +3208,7 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3240,10 +3240,10 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: 
ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3254,7 +3254,7 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3286,10 +3286,10 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3300,7 +3300,7 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 
%r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3332,10 +3332,10 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3346,7 +3346,7 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3378,10 +3378,10 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; 
SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3392,7 +3392,7 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3424,10 +3424,10 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3438,7 +3438,7 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3470,10 +3470,10 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, 
[acq_rel_acquire_i16_generic_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3484,7 +3484,7 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3516,10 +3516,10 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3530,7 +3530,7 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB76_1: // 
%partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3562,10 +3562,10 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3576,7 +3576,7 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3608,10 +3608,10 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3622,7 +3622,7 
@@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3654,10 +3654,10 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3668,7 +3668,7 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3700,10 +3700,10 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[acq_rel_seq_cst_i16_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3714,7 +3714,7 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3746,10 +3746,10 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3760,7 +3760,7 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3792,10 +3792,10 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) 
%addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3806,7 +3806,7 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3838,10 +3838,10 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3852,7 +3852,7 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: 
shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3884,10 +3884,10 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3898,7 +3898,7 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3930,10 +3930,10 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM70-NEXT: 
ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3944,7 +3944,7 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3976,10 +3976,10 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -3990,7 +3990,7 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4022,10 +4022,10 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, 
[seq_cst_seq_cst_i16_generic_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -4036,7 +4036,7 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4068,10 +4068,10 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -4082,7 +4082,7 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.global.u32 %r15, [%rd1]; +; SM70-NEXT: ld.global.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 
%r19, %r15, %r2; ; SM70-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4114,10 +4114,10 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -4128,7 +4128,7 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM70-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB89_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4158,9 +4158,9 @@ define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; ; SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 
[func_retval0], %r2; ; SM70-NEXT: ret; @@ -4175,9 +4175,9 @@ define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; ; SM70-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4192,9 +4192,9 @@ define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; ; SM70-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4209,9 +4209,9 @@ define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; +; 
SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4226,9 +4226,9 @@ define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4243,9 +4243,9 @@ define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4260,10 +4260,10 @@ define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM70-NEXT: 
ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4278,10 +4278,10 @@ define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4296,10 +4296,10 @@ define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 
[func_retval0], %r2; ; SM70-NEXT: ret; @@ -4314,9 +4314,9 @@ define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4331,9 +4331,9 @@ define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4348,9 +4348,9 @@ define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, 
[acquire_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4365,9 +4365,9 @@ define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4382,9 +4382,9 @@ define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4399,9 +4399,9 @@ define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1]; -; 
SM70-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4416,10 +4416,10 @@ define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4434,10 +4434,10 @@ define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4452,10 +4452,10 @@ define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, 
i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4470,9 +4470,9 @@ define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; ; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4487,9 +4487,9 @@ define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; ; SM70-NEXT: 
atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4504,9 +4504,9 @@ define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; ; SM70-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4521,9 +4521,9 @@ define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; ; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4538,9 +4538,9 @@ define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, 
[release_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; ; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4555,9 +4555,9 @@ define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; ; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4572,10 +4572,10 @@ define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4590,10 +4590,10 @@ define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, 
[release_seq_cst_i32_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4608,10 +4608,10 @@ define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4626,9 +4626,9 @@ define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; ; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], 
%r2; ; SM70-NEXT: ret; @@ -4643,9 +4643,9 @@ define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; ; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4660,9 +4660,9 @@ define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; ; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4677,9 +4677,9 @@ define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.b32 %r1, 
[acq_rel_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; ; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4694,9 +4694,9 @@ define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; ; SM70-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4711,9 +4711,9 @@ define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; ; SM70-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4728,10 +4728,10 @@ define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; ; SM70-NEXT: 
fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4746,10 +4746,10 @@ define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4764,10 +4764,10 @@ define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4782,10 +4782,10 @@ define i32 @seq_cst_monotonic_i32_generic(ptr %addr, 
i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4800,10 +4800,10 @@ define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4818,10 +4818,10 @@ define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM70-NEXT: ld.param.b32 %r1, 
[seq_cst_monotonic_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4836,10 +4836,10 @@ define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4854,10 +4854,10 @@ define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4872,10 +4872,10 @@ define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM70-NEXT: 
ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4890,10 +4890,10 @@ define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4908,10 +4908,10 @@ define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4926,10 
+4926,10 @@ define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -4943,9 +4943,9 @@ define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; ; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -4959,9 +4959,9 @@ define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, 
[monotonic_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; ; SM70-NEXT: atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -4975,9 +4975,9 @@ define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; ; SM70-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -4991,9 +4991,9 @@ define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5007,9 +5007,9 @@ define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM70-NEXT: 
ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5023,9 +5023,9 @@ define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5039,10 +5039,10 @@ define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ 
-5056,10 +5056,10 @@ define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5073,10 +5073,10 @@ define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5090,9 +5090,9 @@ define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; 
+; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5106,9 +5106,9 @@ define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5122,9 +5122,9 @@ define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5138,9 +5138,9 @@ define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0]; -; 
SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5154,9 +5154,9 @@ define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5170,9 +5170,9 @@ define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5186,10 +5186,10 @@ define i64 
@acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5203,10 +5203,10 @@ define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5220,10 +5220,10 @@ define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, 
[acquire_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5237,9 +5237,9 @@ define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; ; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5253,9 +5253,9 @@ define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; ; SM70-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5269,9 +5269,9 @@ define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, 
[release_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; ; SM70-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5285,9 +5285,9 @@ define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; ; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5301,9 +5301,9 @@ define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; ; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5317,9 +5317,9 @@ define i64 @release_acquire_i64_shared(ptr addrspace(3) 
%addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; ; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5333,10 +5333,10 @@ define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5350,10 +5350,10 @@ define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, 
[release_seq_cst_i64_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5367,10 +5367,10 @@ define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5384,9 +5384,9 @@ define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; ; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5400,9 +5400,9 @@ define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM70-NEXT: 
ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; ; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5416,9 +5416,9 @@ define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; ; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5432,9 +5432,9 @@ define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; ; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5448,9 +5448,9 @@ define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 
%rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; ; SM70-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5464,9 +5464,9 @@ define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_shared_param_2]; ; SM70-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5480,10 +5480,10 @@ define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b64 %rd4, 
[%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5497,10 +5497,10 @@ define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5514,10 +5514,10 @@ define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5531,10 +5531,10 @@ define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, 
[seq_cst_monotonic_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5548,10 +5548,10 @@ define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5565,10 +5565,10 @@ define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5582,10 +5582,10 @@ define i64 @seq_cst_acquire_i64_generic(ptr 
%addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5599,10 +5599,10 @@ define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5616,10 +5616,10 @@ define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; 
SM70-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5633,10 +5633,10 @@ define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; ; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5650,10 +5650,10 @@ define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; ; SM70-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -5667,10 +5667,10 @@ define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, 
[seq_cst_seq_cst_i64_shared_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; ; SM70-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll index 6df7b3d695f7..f289c3cf3d50 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-sm90.ll @@ -11,8 +11,8 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -23,9 +23,9 @@ define i8 @monotonic_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -56,8 +56,8 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; 
; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -68,9 +68,9 @@ define i8 @monotonic_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -101,8 +101,8 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -113,9 +113,9 @@ define i8 @monotonic_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %ne ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_monotonic_i8_shared_param_1]; ; SM90-NEXT: 
shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -146,8 +146,8 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -158,9 +158,9 @@ define i8 @monotonic_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -192,8 +192,8 @@ define i8 @monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -204,9 +204,9 @@ define i8 
@monotonic_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -238,8 +238,8 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -250,9 +250,9 @@ define i8 @monotonic_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_acquire_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -284,8 +284,8 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; -; 
SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_generic_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -297,9 +297,9 @@ define i8 @monotonic_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -331,8 +331,8 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -344,9 +344,9 @@ define i8 @monotonic_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB7_1: 
// %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -378,8 +378,8 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [monotonic_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -391,9 +391,9 @@ define i8 @monotonic_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [monotonic_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [monotonic_seq_cst_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -425,8 +425,8 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -437,9 +437,9 @@ define i8 @acquire_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, 
%r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -471,8 +471,8 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -483,9 +483,9 @@ define i8 @acquire_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB10_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -517,8 +517,8 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_monotonic_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -529,9 +529,9 @@ define i8 @acquire_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_monotonic_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB11_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -563,8 +563,8 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -575,9 +575,9 @@ define i8 @acquire_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB12_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -609,8 +609,8 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; 
SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -621,9 +621,9 @@ define i8 @acquire_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB13_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -655,8 +655,8 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i8_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -667,9 +667,9 @@ define i8 @acquire_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_acquire_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, 
%r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB14_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -701,8 +701,8 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_generic_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -714,9 +714,9 @@ define i8 @acquire_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB15_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -748,8 +748,8 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -761,9 +761,9 @@ define i8 @acquire_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 
%new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB16_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -795,8 +795,8 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -808,9 +808,9 @@ define i8 @acquire_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_seq_cst_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB17_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -842,8 +842,8 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_generic_param_0]; +; SM90-NEXT: 
ld.param.b8 %rs1, [release_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_generic_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -855,9 +855,9 @@ define i8 @release_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB18_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -888,8 +888,8 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_global_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -901,9 +901,9 @@ define i8 @release_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB19_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: 
Depth=1 @@ -934,8 +934,8 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i8_shared_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -947,9 +947,9 @@ define i8 @release_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_monotonic_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB20_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -980,8 +980,8 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_generic_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -993,9 +993,9 @@ define i8 @release_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_generic_param_1]; 
+; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB21_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1027,8 +1027,8 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_global_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1040,9 +1040,9 @@ define i8 @release_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB22_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1074,8 +1074,8 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i8_shared_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, 
-4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1087,9 +1087,9 @@ define i8 @release_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_acquire_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB23_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1121,8 +1121,8 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_generic_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1134,9 +1134,9 @@ define i8 @release_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB24_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1168,8 +1168,8 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, 
[release_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1181,9 +1181,9 @@ define i8 @release_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB25_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1215,8 +1215,8 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [release_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1228,9 +1228,9 @@ define i8 @release_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_seq_cst_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: 
and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB26_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1262,8 +1262,8 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_generic_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1275,9 +1275,9 @@ define i8 @acq_rel_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB27_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1309,8 +1309,8 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_global_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1322,9 +1322,9 @@ define i8 @acq_rel_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 
%r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB28_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1356,8 +1356,8 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i8_shared_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1369,9 +1369,9 @@ define i8 @acq_rel_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_monotonic_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB29_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1403,8 +1403,8 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, 
[acq_rel_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_generic_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1416,9 +1416,9 @@ define i8 @acq_rel_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB30_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1450,8 +1450,8 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_global_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1463,9 +1463,9 @@ define i8 @acq_rel_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB31_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1497,8 +1497,8 @@ define 
i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i8_shared_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1510,9 +1510,9 @@ define i8 @acq_rel_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_acquire_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB32_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1544,8 +1544,8 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_generic_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1557,9 +1557,9 @@ define i8 @acq_rel_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, 
[acq_rel_seq_cst_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB33_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1591,8 +1591,8 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1604,9 +1604,9 @@ define i8 @acq_rel_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB34_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1638,8 +1638,8 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ 
-1651,9 +1651,9 @@ define i8 @acq_rel_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_seq_cst_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB35_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1685,8 +1685,8 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_generic_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1698,9 +1698,9 @@ define i8 @seq_cst_monotonic_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB36_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1732,8 +1732,8 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_global_param_2]; 
-; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1745,9 +1745,9 @@ define i8 @seq_cst_monotonic_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB37_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1779,8 +1779,8 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_monotonic_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1792,9 +1792,9 @@ define i8 @seq_cst_monotonic_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_monotonic_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_monotonic_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, 
%r2; ; SM90-NEXT: $L__BB38_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1826,8 +1826,8 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_generic_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1839,9 +1839,9 @@ define i8 @seq_cst_acquire_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB39_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1873,8 +1873,8 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1886,9 +1886,9 @@ define i8 @seq_cst_acquire_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; 
-; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB40_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1920,8 +1920,8 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_acquire_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_acquire_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1933,9 +1933,9 @@ define i8 @seq_cst_acquire_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_acquire_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_acquire_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB41_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1967,8 +1967,8 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_generic_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[seq_cst_seq_cst_i8_generic_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -1980,9 +1980,9 @@ define i8 @seq_cst_seq_cst_i8_generic(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_generic_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB42_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2014,8 +2014,8 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_global_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2027,9 +2027,9 @@ define i8 @seq_cst_seq_cst_i8_global(ptr addrspace(1) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_global_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_global_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.global.u32 %r16, [%rd1]; +; SM90-NEXT: ld.global.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB43_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2061,8 +2061,8 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; 
SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_seq_cst_i8_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i8_shared_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -2074,9 +2074,9 @@ define i8 @seq_cst_seq_cst_i8_shared(ptr addrspace(3) %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_seq_cst_i8_shared_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.shared.u32 %r16, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB44_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2108,10 +2108,10 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_generic_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_generic_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_generic_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -2121,7 +2121,7 @@ define i16 @monotonic_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; 
SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB45_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2152,10 +2152,10 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_global_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_global_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -2165,7 +2165,7 @@ define i16 @monotonic_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB46_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2196,10 +2196,10 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i16_shared_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i16_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, 
[monotonic_monotonic_i16_shared_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_monotonic_i16_shared_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -2209,7 +2209,7 @@ define i16 @monotonic_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB47_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2240,10 +2240,10 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_generic_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_generic_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_generic_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -2253,7 +2253,7 @@ define i16 @monotonic_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB48_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2285,10 +2285,10 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // 
%bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_global_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_global_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_global_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -2298,7 +2298,7 @@ define i16 @monotonic_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB49_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2330,10 +2330,10 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i16_shared_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i16_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_acquire_i16_shared_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_acquire_i16_shared_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -2343,7 +2343,7 @@ define i16 @monotonic_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; 
SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB50_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2375,10 +2375,10 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -2389,7 +2389,7 @@ define i16 @monotonic_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB51_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2421,10 +2421,10 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_global_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.b16 %r9, 
[monotonic_seq_cst_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -2435,7 +2435,7 @@ define i16 @monotonic_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB52_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2467,10 +2467,10 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [monotonic_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i16_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [monotonic_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [monotonic_seq_cst_i16_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -2481,7 +2481,7 @@ define i16 @monotonic_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB53_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2513,10 +2513,10 @@ define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, 
[acquire_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_generic_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_generic_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_generic_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -2526,7 +2526,7 @@ define i16 @acquire_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB54_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2558,10 +2558,10 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_global_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_global_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -2571,7 +2571,7 @@ define i16 @acquire_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: 
ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB55_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2603,10 +2603,10 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i16_shared_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i16_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_monotonic_i16_shared_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_monotonic_i16_shared_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -2616,7 +2616,7 @@ define i16 @acquire_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB56_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2648,10 +2648,10 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_generic_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_generic_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_generic_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_generic_param_1]; ; 
SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -2661,7 +2661,7 @@ define i16 @acquire_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB57_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2693,10 +2693,10 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i16_global_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_global_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_global_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_global_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -2706,7 +2706,7 @@ define i16 @acquire_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB58_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2738,10 +2738,10 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, 
[acquire_acquire_i16_shared_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i16_shared_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_acquire_i16_shared_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_acquire_i16_shared_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -2751,7 +2751,7 @@ define i16 @acquire_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB59_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2783,10 +2783,10 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_generic_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -2797,7 +2797,7 @@ define i16 @acquire_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB60_1: // %partword.cmpxchg.loop ; 
SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2829,10 +2829,10 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_global_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -2843,7 +2843,7 @@ define i16 @acquire_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB61_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2875,10 +2875,10 @@ define i16 @acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i16_shared_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i16_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acquire_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_seq_cst_i16_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -2889,7 +2889,7 @@ define i16 
@acquire_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB62_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2921,10 +2921,10 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_generic_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_generic_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_generic_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -2935,7 +2935,7 @@ define i16 @release_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB63_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -2966,10 +2966,10 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_global_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[release_monotonic_i16_global_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -2980,7 +2980,7 @@ define i16 @release_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB64_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3011,10 +3011,10 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i16_shared_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i16_shared_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_monotonic_i16_shared_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_monotonic_i16_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3025,7 +3025,7 @@ define i16 @release_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB65_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3056,10 +3056,10 @@ define i16 
@release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_generic_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_generic_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_generic_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3070,7 +3070,7 @@ define i16 @release_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB66_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3102,10 +3102,10 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_global_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_global_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_global_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3116,7 +3116,7 @@ define i16 @release_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; 
SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB67_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3148,10 +3148,10 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i16_shared_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i16_shared_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_acquire_i16_shared_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_acquire_i16_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3162,7 +3162,7 @@ define i16 @release_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB68_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3194,10 +3194,10 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_generic_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, 
[release_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3208,7 +3208,7 @@ define i16 @release_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB69_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3240,10 +3240,10 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_global_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3254,7 +3254,7 @@ define i16 @release_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB70_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3286,10 +3286,10 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.u16 %rs1, [release_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i16_shared_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i16_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [release_seq_cst_i16_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3300,7 +3300,7 @@ define i16 @release_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB71_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3332,10 +3332,10 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_generic_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_generic_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3346,7 +3346,7 @@ define i16 @acq_rel_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 
%r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB72_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3378,10 +3378,10 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_global_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_global_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3392,7 +3392,7 @@ define i16 @acq_rel_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB73_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3424,10 +3424,10 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i16_shared_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_monotonic_i16_shared_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_monotonic_i16_shared_param_1]; ; SM90-NEXT: 
and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3438,7 +3438,7 @@ define i16 @acq_rel_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB74_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3470,10 +3470,10 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_generic_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_generic_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_generic_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3484,7 +3484,7 @@ define i16 @acq_rel_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB75_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3516,10 +3516,10 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_global_param_0]; 
+; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_global_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_global_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3530,7 +3530,7 @@ define i16 @acq_rel_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB76_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3562,10 +3562,10 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i16_shared_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i16_shared_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_acquire_i16_shared_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_acquire_i16_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3576,7 +3576,7 @@ define i16 @acq_rel_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB77_1: // %partword.cmpxchg.loop ; SM90-NEXT: 
// =>This Inner Loop Header: Depth=1 @@ -3608,10 +3608,10 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3622,7 +3622,7 @@ define i16 @acq_rel_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB78_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3654,10 +3654,10 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3668,7 +3668,7 @@ define i16 @acq_rel_seq_cst_i16_global(ptr addrspace(1) 
%addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB79_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3700,10 +3700,10 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i16_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_seq_cst_i16_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3714,7 +3714,7 @@ define i16 @acq_rel_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB80_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3746,10 +3746,10 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_generic_param_0]; ; 
SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_generic_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3760,7 +3760,7 @@ define i16 @seq_cst_monotonic_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB81_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3792,10 +3792,10 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_global_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_global_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3806,7 +3806,7 @@ define i16 @seq_cst_monotonic_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB82_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3838,10 +3838,10 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: 
.reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_monotonic_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i16_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_monotonic_i16_shared_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_monotonic_i16_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3852,7 +3852,7 @@ define i16 @seq_cst_monotonic_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 % ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB83_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3884,10 +3884,10 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_generic_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_generic_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3898,7 +3898,7 @@ define i16 @seq_cst_acquire_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, 
%r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB84_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3930,10 +3930,10 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_global_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_global_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_global_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_acquire_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3944,7 +3944,7 @@ define i16 @seq_cst_acquire_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB85_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -3976,10 +3976,10 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_acquire_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i16_shared_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_acquire_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i16_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_acquire_i16_shared_param_1]; +; SM90-NEXT: ld.param.b16 %r9, 
[seq_cst_acquire_i16_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -3990,7 +3990,7 @@ define i16 @seq_cst_acquire_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB86_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4022,10 +4022,10 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_generic_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -4036,7 +4036,7 @@ define i16 @seq_cst_seq_cst_i16_generic(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB87_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4068,10 +4068,10 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; -; SM90-NEXT: 
ld.param.u64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_global_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_global_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -4082,7 +4082,7 @@ define i16 @seq_cst_seq_cst_i16_global(ptr addrspace(1) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.global.u32 %r15, [%rd1]; +; SM90-NEXT: ld.global.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB88_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4114,10 +4114,10 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_seq_cst_i16_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i16_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_seq_cst_i16_shared_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -4128,7 +4128,7 @@ define i16 @seq_cst_seq_cst_i16_shared(ptr addrspace(3) %addr, i16 %cmp, i16 %ne ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.shared.u32 %r15, [%rd1]; +; SM90-NEXT: ld.shared.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: 
$L__BB89_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -4158,9 +4158,9 @@ define i32 @monotonic_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_generic_param_2]; ; SM90-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4175,9 +4175,9 @@ define i32 @monotonic_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_global_param_2]; ; SM90-NEXT: atom.relaxed.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4192,9 +4192,9 @@ define i32 @monotonic_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_monotonic_i32_shared_param_2]; +; SM90-NEXT: ld.param.b64 
%rd1, [monotonic_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_monotonic_i32_shared_param_2]; ; SM90-NEXT: atom.relaxed.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4209,9 +4209,9 @@ define i32 @monotonic_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4226,9 +4226,9 @@ define i32 @monotonic_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4243,9 +4243,9 @@ define i32 @monotonic_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, 
[monotonic_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_acquire_i32_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_acquire_i32_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4260,10 +4260,10 @@ define i32 @monotonic_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_generic_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4278,10 +4278,10 @@ define i32 @monotonic_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_global_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: 
st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4296,10 +4296,10 @@ define i32 @monotonic_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i32_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [monotonic_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [monotonic_seq_cst_i32_shared_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [monotonic_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [monotonic_seq_cst_i32_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4314,9 +4314,9 @@ define i32 @acquire_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4331,9 +4331,9 @@ define i32 @acquire_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, 
[acquire_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4348,9 +4348,9 @@ define i32 @acquire_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_monotonic_i32_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_monotonic_i32_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4365,9 +4365,9 @@ define i32 @acquire_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4382,9 +4382,9 @@ define i32 @acquire_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, 
[acquire_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4399,9 +4399,9 @@ define i32 @acquire_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_acquire_i32_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_acquire_i32_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4416,10 +4416,10 @@ define i32 @acquire_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_generic_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4434,10 
+4434,10 @@ define i32 @acquire_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_global_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4452,10 +4452,10 @@ define i32 @acquire_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i32_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i32_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acquire_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_seq_cst_i32_shared_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_seq_cst_i32_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4470,9 +4470,9 @@ define i32 @release_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, 
[release_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_generic_param_2]; ; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4487,9 +4487,9 @@ define i32 @release_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_global_param_2]; ; SM90-NEXT: atom.release.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4504,9 +4504,9 @@ define i32 @release_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_monotonic_i32_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_monotonic_i32_shared_param_2]; ; SM90-NEXT: atom.release.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4521,9 +4521,9 @@ define i32 @release_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, 
[release_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_generic_param_2]; ; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4538,9 +4538,9 @@ define i32 @release_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_global_param_2]; ; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4555,9 +4555,9 @@ define i32 @release_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [release_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_acquire_i32_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_acquire_i32_shared_param_2]; ; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4572,10 +4572,10 @@ define i32 @release_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: 
.reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_generic_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4590,10 +4590,10 @@ define i32 @release_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_global_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_seq_cst_i32_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4608,10 +4608,10 @@ define i32 @release_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i32_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i32_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [release_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_seq_cst_i32_shared_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [release_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, 
[release_seq_cst_i32_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4626,9 +4626,9 @@ define i32 @acq_rel_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_generic_param_2]; ; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4643,9 +4643,9 @@ define i32 @acq_rel_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_monotonic_i32_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_global_param_2]; ; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4660,9 +4660,9 @@ define i32 @acq_rel_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, 
[acq_rel_monotonic_i32_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_monotonic_i32_shared_param_2]; ; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4677,9 +4677,9 @@ define i32 @acq_rel_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_generic_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_generic_param_2]; ; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4694,9 +4694,9 @@ define i32 @acq_rel_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_global_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_global_param_2]; ; SM90-NEXT: atom.acq_rel.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4711,9 +4711,9 @@ define i32 @acq_rel_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; 
SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i32_shared_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_acquire_i32_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_acquire_i32_shared_param_2]; ; SM90-NEXT: atom.acq_rel.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4728,10 +4728,10 @@ define i32 @acq_rel_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4746,10 +4746,10 @@ define i32 @acq_rel_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_global_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: 
st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4764,10 +4764,10 @@ define i32 @acq_rel_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i32_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_seq_cst_i32_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4782,10 +4782,10 @@ define i32 @seq_cst_monotonic_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_generic_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4800,10 +4800,10 @@ define i32 @seq_cst_monotonic_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_global_param_1]; -; SM90-NEXT: 
ld.param.u32 %r2, [seq_cst_monotonic_i32_global_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4818,10 +4818,10 @@ define i32 @seq_cst_monotonic_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 % ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i32_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_monotonic_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_monotonic_i32_shared_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_monotonic_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_monotonic_i32_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4836,10 +4836,10 @@ define i32 @seq_cst_acquire_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_generic_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4854,10 +4854,10 @@ define i32 @seq_cst_acquire_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: 
// %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_global_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4872,10 +4872,10 @@ define i32 @seq_cst_acquire_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i32_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i32_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_acquire_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_acquire_i32_shared_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_acquire_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_acquire_i32_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4890,10 +4890,10 @@ define i32 @seq_cst_seq_cst_i32_generic(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_generic_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b32 %r3, 
[%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4908,10 +4908,10 @@ define i32 @seq_cst_seq_cst_i32_global(ptr addrspace(1) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_global_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_global_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_global_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4926,10 +4926,10 @@ define i32 @seq_cst_seq_cst_i32_shared(ptr addrspace(3) %addr, i32 %cmp, i32 %ne ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i32_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_seq_cst_i32_shared_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_seq_cst_i32_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -4943,9 +4943,9 @@ define i64 @monotonic_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_generic_param_2]; +; SM90-NEXT: 
ld.param.b64 %rd1, [monotonic_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_generic_param_2]; ; SM90-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -4959,9 +4959,9 @@ define i64 @monotonic_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_global_param_2]; ; SM90-NEXT: atom.relaxed.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -4975,9 +4975,9 @@ define i64 @monotonic_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_monotonic_i64_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_monotonic_i64_shared_param_2]; ; SM90-NEXT: atom.relaxed.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -4991,9 +4991,9 @@ define i64 @monotonic_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; 
SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5007,9 +5007,9 @@ define i64 @monotonic_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5023,9 +5023,9 @@ define i64 @monotonic_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_acquire_i64_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_acquire_i64_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], 
%rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5039,10 +5039,10 @@ define i64 @monotonic_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5056,10 +5056,10 @@ define i64 @monotonic_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [monotonic_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5073,10 +5073,10 @@ define i64 @monotonic_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [monotonic_seq_cst_i64_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, 
[monotonic_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [monotonic_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [monotonic_seq_cst_i64_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5090,9 +5090,9 @@ define i64 @acquire_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5106,9 +5106,9 @@ define i64 @acquire_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5122,9 +5122,9 @@ define i64 @acquire_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM90-NEXT: .reg .b64 
%rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_monotonic_i64_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_monotonic_i64_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5138,9 +5138,9 @@ define i64 @acquire_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5154,9 +5154,9 @@ define i64 @acquire_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, 
%rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5170,9 +5170,9 @@ define i64 @acquire_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_acquire_i64_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_acquire_i64_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5186,10 +5186,10 @@ define i64 @acquire_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5203,10 +5203,10 @@ define i64 @acquire_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, 
[acquire_seq_cst_i64_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5220,10 +5220,10 @@ define i64 @acquire_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_seq_cst_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_seq_cst_i64_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_seq_cst_i64_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_seq_cst_i64_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5237,9 +5237,9 @@ define i64 @release_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_generic_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_generic_param_2]; ; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5253,9 +5253,9 @@ define i64 @release_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: 
ld.param.u64 %rd1, [release_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_global_param_2]; ; SM90-NEXT: atom.release.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5269,9 +5269,9 @@ define i64 @release_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_monotonic_i64_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_monotonic_i64_shared_param_2]; ; SM90-NEXT: atom.release.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5285,9 +5285,9 @@ define i64 @release_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_generic_param_2]; ; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], 
%rd3; ; SM90-NEXT: ret; @@ -5301,9 +5301,9 @@ define i64 @release_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_global_param_2]; ; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5317,9 +5317,9 @@ define i64 @release_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_acquire_i64_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_acquire_i64_shared_param_2]; ; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5333,10 +5333,10 @@ define i64 @release_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, 
[release_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5350,10 +5350,10 @@ define i64 @release_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5367,10 +5367,10 @@ define i64 @release_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_seq_cst_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [release_seq_cst_i64_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [release_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_seq_cst_i64_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_seq_cst_i64_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5384,9 +5384,9 @@ define i64 @acq_rel_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; -; 
SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_generic_param_2]; ; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5400,9 +5400,9 @@ define i64 @acq_rel_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_global_param_2]; ; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5416,9 +5416,9 @@ define i64 @acq_rel_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_monotonic_i64_shared_param_2]; ; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5432,9 +5432,9 @@ 
define i64 @acq_rel_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_generic_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_generic_param_2]; ; SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5448,9 +5448,9 @@ define i64 @acq_rel_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_global_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_acquire_i64_global_param_2]; ; SM90-NEXT: atom.acq_rel.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5464,9 +5464,9 @@ define i64 @acq_rel_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_acquire_i64_shared_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_acquire_i64_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, 
[acq_rel_acquire_i64_shared_param_2]; ; SM90-NEXT: atom.acq_rel.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5480,10 +5480,10 @@ define i64 @acq_rel_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5497,10 +5497,10 @@ define i64 @acq_rel_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5514,10 +5514,10 @@ define i64 @acq_rel_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_seq_cst_i64_shared_param_0]; ; 
SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_seq_cst_i64_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5531,10 +5531,10 @@ define i64 @seq_cst_monotonic_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5548,10 +5548,10 @@ define i64 @seq_cst_monotonic_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5565,10 +5565,10 
@@ define i64 @seq_cst_monotonic_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 % ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_monotonic_i64_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_monotonic_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_monotonic_i64_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5582,10 +5582,10 @@ define i64 @seq_cst_acquire_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5599,10 +5599,10 @@ define i64 @seq_cst_acquire_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_global_param_2]; +; SM90-NEXT: 
ld.param.b64 %rd2, [seq_cst_acquire_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5616,10 +5616,10 @@ define i64 @seq_cst_acquire_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_acquire_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_acquire_i64_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_acquire_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_acquire_i64_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_acquire_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_acquire_i64_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5633,10 +5633,10 @@ define i64 @seq_cst_seq_cst_i64_generic(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_generic_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_generic_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_generic_param_2]; ; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5650,10 +5650,10 @@ define i64 @seq_cst_seq_cst_i64_global(ptr addrspace(1) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, 
[seq_cst_seq_cst_i64_global_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_global_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_global_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_global_param_2]; ; SM90-NEXT: atom.acquire.global.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -5667,10 +5667,10 @@ define i64 @seq_cst_seq_cst_i64_shared(ptr addrspace(3) %addr, i64 %cmp, i64 %ne ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_seq_cst_i64_shared_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_seq_cst_i64_shared_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_seq_cst_i64_shared_param_2]; ; SM90-NEXT: atom.acquire.shared.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll index e5f05e49d2fe..9eeff9d7c2b7 100644 --- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll +++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll @@ -18,8 +18,8 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM30-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r9, %rd2; ; SM30-NEXT: and.b32 %r10, 
%r9, 3; @@ -30,9 +30,9 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: cvt.u32.u16 %r13, %rs1; ; SM30-NEXT: and.b32 %r14, %r13, 255; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1]; ; SM30-NEXT: shl.b32 %r4, %r15, %r1; -; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -59,8 +59,8 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -71,9 +71,9 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -99,8 +99,8 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [relaxed_sys_i8_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [relaxed_sys_i8_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, 
-4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -111,9 +111,9 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [relaxed_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB0_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -144,8 +144,8 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0]; +; SM30-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r9, %rd2; ; SM30-NEXT: and.b32 %r10, %r9, 3; @@ -156,9 +156,9 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: cvt.u32.u16 %r13, %rs1; ; SM30-NEXT: and.b32 %r14, %r13, 255; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1]; ; SM30-NEXT: shl.b32 %r4, %r15, %r1; -; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -186,8 +186,8 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; +; SM70-NEXT: 
ld.param.b64 %rd2, [acquire_sys_i8_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; ; SM70-NEXT: and.b32 %r10, %r9, 3; @@ -198,9 +198,9 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -227,8 +227,8 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acquire_sys_i8_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i8_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acquire_sys_i8_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_sys_i8_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; ; SM90-NEXT: and.b32 %r10, %r9, 3; @@ -239,9 +239,9 @@ define i8 @acquire_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acquire_sys_i8_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acquire_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB1_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -273,8 +273,8 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, 
[release_sys_i8_param_0]; +; SM30-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; ; SM30-NEXT: membar.sys; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r9, %rd2; @@ -286,9 +286,9 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: cvt.u32.u16 %r13, %rs1; ; SM30-NEXT: and.b32 %r14, %r13, 255; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1]; ; SM30-NEXT: shl.b32 %r4, %r15, %r1; -; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -315,8 +315,8 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [release_sys_i8_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -328,9 +328,9 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -356,8 +356,8 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 
%rs1, [release_sys_i8_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i8_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [release_sys_i8_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_sys_i8_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -369,9 +369,9 @@ define i8 @release_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [release_sys_i8_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [release_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB2_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -402,8 +402,8 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; +; SM30-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; ; SM30-NEXT: membar.sys; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r9, %rd2; @@ -415,9 +415,9 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: cvt.u32.u16 %r13, %rs1; ; SM30-NEXT: and.b32 %r14, %r13, 255; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1]; ; SM30-NEXT: shl.b32 %r4, %r15, %r1; -; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -445,8 +445,8 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; 
SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; ; SM70-NEXT: fence.acq_rel.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -458,9 +458,9 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -487,8 +487,8 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [acq_rel_sys_i8_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i8_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [acq_rel_sys_i8_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i8_param_0]; ; SM90-NEXT: fence.release.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -500,9 +500,9 @@ define i8 @acq_rel_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [acq_rel_sys_i8_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [acq_rel_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB3_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -534,8 +534,8 @@ define i8 
@seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; +; SM30-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; ; SM30-NEXT: membar.sys; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r9, %rd2; @@ -547,9 +547,9 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM30-NEXT: cvt.u32.u16 %r13, %rs1; ; SM30-NEXT: and.b32 %r14, %r13, 255; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; -; SM30-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; +; SM30-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1]; ; SM30-NEXT: shl.b32 %r4, %r15, %r1; -; SM30-NEXT: ld.u32 %r16, [%rd1]; +; SM30-NEXT: ld.b32 %r16, [%rd1]; ; SM30-NEXT: and.b32 %r20, %r16, %r2; ; SM30-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -577,8 +577,8 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; +; SM70-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; ; SM70-NEXT: fence.sc.sys; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r9, %rd2; @@ -590,9 +590,9 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM70-NEXT: cvt.u32.u16 %r13, %rs1; ; SM70-NEXT: and.b32 %r14, %r13, 255; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; -; SM70-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; +; SM70-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1]; ; SM70-NEXT: shl.b32 %r4, %r15, %r1; -; SM70-NEXT: ld.u32 %r16, [%rd1]; +; SM70-NEXT: ld.b32 %r16, [%rd1]; ; SM70-NEXT: and.b32 %r20, %r16, %r2; ; SM70-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This 
Inner Loop Header: Depth=1 @@ -619,8 +619,8 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u8 %rs1, [seq_cst_sys_i8_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i8_param_0]; +; SM90-NEXT: ld.param.b8 %rs1, [seq_cst_sys_i8_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i8_param_0]; ; SM90-NEXT: fence.sc.sys; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r9, %rd2; @@ -632,9 +632,9 @@ define i8 @seq_cst_sys_i8(ptr %addr, i8 %cmp, i8 %new) { ; SM90-NEXT: cvt.u32.u16 %r13, %rs1; ; SM90-NEXT: and.b32 %r14, %r13, 255; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; -; SM90-NEXT: ld.param.u8 %r15, [seq_cst_sys_i8_param_1]; +; SM90-NEXT: ld.param.b8 %r15, [seq_cst_sys_i8_param_1]; ; SM90-NEXT: shl.b32 %r4, %r15, %r1; -; SM90-NEXT: ld.u32 %r16, [%rd1]; +; SM90-NEXT: ld.b32 %r16, [%rd1]; ; SM90-NEXT: and.b32 %r20, %r16, %r2; ; SM90-NEXT: $L__BB4_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -667,10 +667,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM30-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1]; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 %r11, %r10, 3; ; SM30-NEXT: shl.b32 %r1, %r11, 3; @@ -680,7 +680,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; ; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: ld.b32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, 
%r15, %r2; ; SM30-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -707,10 +707,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -720,7 +720,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -746,10 +746,10 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [relaxed_sys_i16_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [relaxed_sys_i16_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [relaxed_sys_i16_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -759,7 +759,7 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, 
%r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB5_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -790,10 +790,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; +; SM30-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; -; SM30-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1]; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 %r11, %r10, 3; ; SM30-NEXT: shl.b32 %r1, %r11, 3; @@ -803,7 +803,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; ; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: ld.b32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, %r15, %r2; ; SM30-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -831,10 +831,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; -; SM70-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1]; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; ; SM70-NEXT: shl.b32 %r1, %r11, 3; @@ -844,7 +844,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 
%new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -871,10 +871,10 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acquire_sys_i16_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i16_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acquire_sys_i16_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acquire_sys_i16_param_0]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; -; SM90-NEXT: ld.param.u16 %r9, [acquire_sys_i16_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acquire_sys_i16_param_1]; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; ; SM90-NEXT: shl.b32 %r1, %r11, 3; @@ -884,7 +884,7 @@ define i16 @acquire_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB6_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -916,10 +916,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; +; SM30-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: 
and.b32 %r11, %r10, 3; @@ -930,7 +930,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; ; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: ld.b32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, %r15, %r2; ; SM30-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -957,10 +957,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0]; ; SM70-NEXT: fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [release_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -971,7 +971,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -997,10 +997,10 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [release_sys_i16_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i16_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [release_sys_i16_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [release_sys_i16_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [release_sys_i16_param_1]; +; SM90-NEXT: ld.param.b16 %r9, 
[release_sys_i16_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -1011,7 +1011,7 @@ define i16 @release_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB7_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1042,10 +1042,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0]; +; SM30-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 %r11, %r10, 3; @@ -1056,7 +1056,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; ; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: ld.b32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, %r15, %r2; ; SM30-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1084,10 +1084,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0]; ; SM70-NEXT: 
fence.acq_rel.sys; -; SM70-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -1098,7 +1098,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1125,10 +1125,10 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, [acq_rel_sys_i16_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i16_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [acq_rel_sys_i16_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i16_param_0]; ; SM90-NEXT: fence.release.sys; -; SM90-NEXT: ld.param.u16 %r9, [acq_rel_sys_i16_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [acq_rel_sys_i16_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -1139,7 +1139,7 @@ define i16 @acq_rel_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB8_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1172,10 +1172,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: .reg .b64 %rd<3>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2]; -; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; +; SM30-NEXT: 
ld.param.b16 %rs1, [seq_cst_sys_i16_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; +; SM30-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1]; ; SM30-NEXT: and.b64 %rd1, %rd2, -4; ; SM30-NEXT: cvt.u32.u64 %r10, %rd2; ; SM30-NEXT: and.b32 %r11, %r10, 3; @@ -1186,7 +1186,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM30-NEXT: cvt.u32.u16 %r14, %rs1; ; SM30-NEXT: shl.b32 %r3, %r14, %r1; ; SM30-NEXT: shl.b32 %r4, %r9, %r1; -; SM30-NEXT: ld.u32 %r15, [%rd1]; +; SM30-NEXT: ld.b32 %r15, [%rd1]; ; SM30-NEXT: and.b32 %r19, %r15, %r2; ; SM30-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM30-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1214,10 +1214,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: .reg .b64 %rd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u16 %rs1, [seq_cst_sys_i16_param_2]; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; +; SM70-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; +; SM70-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1]; ; SM70-NEXT: and.b64 %rd1, %rd2, -4; ; SM70-NEXT: cvt.u32.u64 %r10, %rd2; ; SM70-NEXT: and.b32 %r11, %r10, 3; @@ -1228,7 +1228,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM70-NEXT: cvt.u32.u16 %r14, %rs1; ; SM70-NEXT: shl.b32 %r3, %r14, %r1; ; SM70-NEXT: shl.b32 %r4, %r9, %r1; -; SM70-NEXT: ld.u32 %r15, [%rd1]; +; SM70-NEXT: ld.b32 %r15, [%rd1]; ; SM70-NEXT: and.b32 %r19, %r15, %r2; ; SM70-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM70-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1255,10 +1255,10 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u16 %rs1, 
[seq_cst_sys_i16_param_2]; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i16_param_0]; +; SM90-NEXT: ld.param.b16 %rs1, [seq_cst_sys_i16_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i16_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u16 %r9, [seq_cst_sys_i16_param_1]; +; SM90-NEXT: ld.param.b16 %r9, [seq_cst_sys_i16_param_1]; ; SM90-NEXT: and.b64 %rd1, %rd2, -4; ; SM90-NEXT: cvt.u32.u64 %r10, %rd2; ; SM90-NEXT: and.b32 %r11, %r10, 3; @@ -1269,7 +1269,7 @@ define i16 @seq_cst_sys_i16(ptr %addr, i16 %cmp, i16 %new) { ; SM90-NEXT: cvt.u32.u16 %r14, %rs1; ; SM90-NEXT: shl.b32 %r3, %r14, %r1; ; SM90-NEXT: shl.b32 %r4, %r9, %r1; -; SM90-NEXT: ld.u32 %r15, [%rd1]; +; SM90-NEXT: ld.b32 %r15, [%rd1]; ; SM90-NEXT: and.b32 %r19, %r15, %r2; ; SM90-NEXT: $L__BB9_1: // %partword.cmpxchg.loop ; SM90-NEXT: // =>This Inner Loop Header: Depth=1 @@ -1300,9 +1300,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: .reg .b64 %rd<2>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; -; SM30-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; -; SM30-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0]; +; SM30-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1]; +; SM30-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; @@ -1313,9 +1313,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2]; ; 
SM70-NEXT: atom.relaxed.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -1325,9 +1325,9 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i32_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [relaxed_sys_i32_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [relaxed_sys_i32_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [relaxed_sys_i32_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [relaxed_sys_i32_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [relaxed_sys_i32_param_2]; ; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1342,9 +1342,9 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: .reg .b64 %rd<2>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; -; SM30-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1]; -; SM30-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0]; +; SM30-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1]; +; SM30-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; @@ -1355,9 +1355,9 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2]; ; SM70-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ 
-1367,9 +1367,9 @@ define i32 @acq_rel_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i32_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acq_rel_sys_i32_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acq_rel_sys_i32_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i32_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acq_rel_sys_i32_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acq_rel_sys_i32_param_2]; ; SM90-NEXT: atom.acq_rel.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1384,9 +1384,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: .reg .b64 %rd<2>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; -; SM30-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; -; SM30-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0]; +; SM30-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1]; +; SM30-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; @@ -1397,9 +1397,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2]; ; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -1409,9 +1409,9 @@ define i32 @acquire_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; 
SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i32_param_0]; -; SM90-NEXT: ld.param.u32 %r1, [acquire_sys_i32_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [acquire_sys_i32_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_sys_i32_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [acquire_sys_i32_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [acquire_sys_i32_param_2]; ; SM90-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1426,9 +1426,9 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: .reg .b64 %rd<2>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; -; SM30-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1]; -; SM30-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0]; +; SM30-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1]; +; SM30-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; @@ -1439,9 +1439,9 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; -; SM70-NEXT: ld.param.u32 %r1, [release_sys_i32_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0]; +; SM70-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2]; ; SM70-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -1451,9 +1451,9 @@ define i32 @release_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i32_param_0]; -; SM90-NEXT: 
ld.param.u32 %r1, [release_sys_i32_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [release_sys_i32_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_sys_i32_param_0]; +; SM90-NEXT: ld.param.b32 %r1, [release_sys_i32_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [release_sys_i32_param_2]; ; SM90-NEXT: atom.release.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; ; SM90-NEXT: ret; @@ -1468,10 +1468,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM30-NEXT: .reg .b64 %rd<2>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; +; SM30-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i32_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; -; SM30-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; +; SM30-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1]; +; SM30-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2]; ; SM30-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM30-NEXT: st.param.b32 [func_retval0], %r2; ; SM30-NEXT: ret; @@ -1482,10 +1482,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i32_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; -; SM70-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; +; SM70-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1]; +; SM70-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2]; ; SM70-NEXT: atom.acquire.cas.b32 %r3, [%rd1], %r1, %r2; ; SM70-NEXT: st.param.b32 [func_retval0], %r2; ; SM70-NEXT: ret; @@ -1495,10 +1495,10 @@ define i32 @seq_cst_sys_i32(ptr %addr, i32 %cmp, i32 %new) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i32_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i32_param_0]; ; SM90-NEXT: 
fence.sc.sys; -; SM90-NEXT: ld.param.u32 %r1, [seq_cst_sys_i32_param_1]; -; SM90-NEXT: ld.param.u32 %r2, [seq_cst_sys_i32_param_2]; +; SM90-NEXT: ld.param.b32 %r1, [seq_cst_sys_i32_param_1]; +; SM90-NEXT: ld.param.b32 %r2, [seq_cst_sys_i32_param_2]; ; SM90-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2; ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: st.param.b32 [func_retval0], %r2; @@ -1514,9 +1514,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: .reg .b64 %rd<5>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; -; SM30-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; -; SM30-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0]; +; SM30-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1]; +; SM30-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; @@ -1526,9 +1526,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [relaxed_sys_i64_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2]; ; SM70-NEXT: atom.relaxed.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -1537,9 +1537,9 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [relaxed_sys_i64_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [relaxed_sys_i64_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [relaxed_sys_i64_param_2]; +; SM90-NEXT: 
ld.param.b64 %rd1, [relaxed_sys_i64_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [relaxed_sys_i64_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [relaxed_sys_i64_param_2]; ; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1553,9 +1553,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: .reg .b64 %rd<5>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; -; SM30-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; -; SM30-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0]; +; SM30-NEXT: ld.param.b64 %rd2, [acquire_sys_i64_param_1]; +; SM30-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; @@ -1565,9 +1565,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acquire_sys_i64_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2]; ; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -1576,9 +1576,9 @@ define i64 @acquire_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acquire_sys_i64_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acquire_sys_i64_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acquire_sys_i64_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acquire_sys_i64_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, 
[acquire_sys_i64_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acquire_sys_i64_param_2]; ; SM90-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1592,9 +1592,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: .reg .b64 %rd<5>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0]; -; SM30-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; -; SM30-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0]; +; SM30-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1]; +; SM30-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; @@ -1604,9 +1604,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2]; ; SM70-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -1615,9 +1615,9 @@ define i64 @acq_rel_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [acq_rel_sys_i64_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [acq_rel_sys_i64_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [acq_rel_sys_i64_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [acq_rel_sys_i64_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [acq_rel_sys_i64_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [acq_rel_sys_i64_param_2]; ; 
SM90-NEXT: atom.acq_rel.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 [func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1631,9 +1631,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: .reg .b64 %rd<5>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; -; SM30-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1]; -; SM30-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; +; SM30-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0]; +; SM30-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1]; +; SM30-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; @@ -1643,9 +1643,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; -; SM70-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; +; SM70-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0]; +; SM70-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2]; ; SM70-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -1654,9 +1654,9 @@ define i64 @release_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [release_sys_i64_param_0]; -; SM90-NEXT: ld.param.u64 %rd2, [release_sys_i64_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [release_sys_i64_param_2]; +; SM90-NEXT: ld.param.b64 %rd1, [release_sys_i64_param_0]; +; SM90-NEXT: ld.param.b64 %rd2, [release_sys_i64_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [release_sys_i64_param_2]; ; SM90-NEXT: atom.release.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: st.param.b64 
[func_retval0], %rd3; ; SM90-NEXT: ret; @@ -1670,10 +1670,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM30-NEXT: .reg .b64 %rd<5>; ; SM30-EMPTY: ; SM30-NEXT: // %bb.0: -; SM30-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; +; SM30-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i64_param_0]; ; SM30-NEXT: membar.sys; -; SM30-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; -; SM30-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; +; SM30-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1]; +; SM30-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2]; ; SM30-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM30-NEXT: st.param.b64 [func_retval0], %rd3; ; SM30-NEXT: ret; @@ -1683,10 +1683,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM70-NEXT: .reg .b64 %rd<5>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; +; SM70-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i64_param_0]; ; SM70-NEXT: fence.sc.sys; -; SM70-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; -; SM70-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; +; SM70-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1]; +; SM70-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2]; ; SM70-NEXT: atom.acquire.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM70-NEXT: st.param.b64 [func_retval0], %rd3; ; SM70-NEXT: ret; @@ -1695,10 +1695,10 @@ define i64 @seq_cst_sys_i64(ptr %addr, i64 %cmp, i64 %new) { ; SM90-NEXT: .reg .b64 %rd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [seq_cst_sys_i64_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [seq_cst_sys_i64_param_0]; ; SM90-NEXT: fence.sc.sys; -; SM90-NEXT: ld.param.u64 %rd2, [seq_cst_sys_i64_param_1]; -; SM90-NEXT: ld.param.u64 %rd3, [seq_cst_sys_i64_param_2]; +; SM90-NEXT: ld.param.b64 %rd2, [seq_cst_sys_i64_param_1]; +; SM90-NEXT: ld.param.b64 %rd3, [seq_cst_sys_i64_param_2]; ; SM90-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3; ; SM90-NEXT: fence.acquire.sys; ; SM90-NEXT: 
st.param.b64 [func_retval0], %rd3; diff --git a/llvm/test/CodeGen/NVPTX/combine-mad.ll b/llvm/test/CodeGen/NVPTX/combine-mad.ll index 319cadcb27f0..dc6d504c2c66 100644 --- a/llvm/test/CodeGen/NVPTX/combine-mad.ll +++ b/llvm/test/CodeGen/NVPTX/combine-mad.ll @@ -11,8 +11,8 @@ define i32 @test1(i32 %n, i32 %m) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test1_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test1_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test1_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test1_param_1]; ; CHECK-NEXT: mad.lo.s32 %r3, %r2, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -28,8 +28,8 @@ define i32 @test1_rev(i32 %n, i32 %m) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test1_rev_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test1_rev_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test1_rev_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test1_rev_param_1]; ; CHECK-NEXT: mad.lo.s32 %r3, %r2, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -47,9 +47,9 @@ define i32 @test2(i32 %n, i32 %m, i32 %s) { ; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test2_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test2_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test2_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test2_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test2_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test2_param_2]; ; CHECK-NEXT: setp.lt.s32 %p1, %r3, 1; ; CHECK-NEXT: mad.lo.s32 %r4, %r2, %r1, %r2; ; CHECK-NEXT: selp.b32 %r5, %r2, %r4, %p1; @@ -71,9 +71,9 @@ define i32 @test2_rev1(i32 %n, i32 %m, i32 %s) { ; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test2_rev1_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test2_rev1_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, 
[test2_rev1_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test2_rev1_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test2_rev1_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test2_rev1_param_2]; ; CHECK-NEXT: setp.lt.s32 %p1, %r3, 1; ; CHECK-NEXT: mad.lo.s32 %r4, %r2, %r1, %r2; ; CHECK-NEXT: selp.b32 %r5, %r4, %r2, %p1; @@ -95,9 +95,9 @@ define i32 @test2_rev2(i32 %n, i32 %m, i32 %s) { ; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test2_rev2_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test2_rev2_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test2_rev2_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test2_rev2_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test2_rev2_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test2_rev2_param_2]; ; CHECK-NEXT: setp.lt.s32 %p1, %r3, 1; ; CHECK-NEXT: mad.lo.s32 %r4, %r2, %r1, %r2; ; CHECK-NEXT: selp.b32 %r5, %r4, %r2, %p1; @@ -119,10 +119,10 @@ define i32 @test3(i32 %n, i32 %m, i32 %s) { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test3_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test3_param_0]; ; CHECK-NEXT: add.s32 %r2, %r1, 3; -; CHECK-NEXT: ld.param.u32 %r3, [test3_param_1]; -; CHECK-NEXT: ld.param.u32 %r4, [test3_param_2]; +; CHECK-NEXT: ld.param.b32 %r3, [test3_param_1]; +; CHECK-NEXT: ld.param.b32 %r4, [test3_param_2]; ; CHECK-NEXT: setp.lt.s32 %p1, %r4, 1; ; CHECK-NEXT: selp.b32 %r5, 1, %r2, %p1; ; CHECK-NEXT: mul.lo.s32 %r6, %r5, %r3; @@ -144,12 +144,12 @@ define i32 @test4(i32 %a, i32 %b, i32 %c, i1 %p) { ; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [test4_param_3]; +; CHECK-NEXT: ld.param.b8 %rs1, [test4_param_3]; ; CHECK-NEXT: and.b16 %rs2, %rs1, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-NEXT: ld.param.u32 %r1, [test4_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test4_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test4_param_2]; +; CHECK-NEXT: ld.param.b32 
%r1, [test4_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test4_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test4_param_2]; ; CHECK-NEXT: mad.lo.s32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: selp.b32 %r5, %r4, %r3, %p1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r5; @@ -168,12 +168,12 @@ define i32 @test4_rev(i32 %a, i32 %b, i32 %c, i1 %p) { ; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [test4_rev_param_3]; +; CHECK-NEXT: ld.param.b8 %rs1, [test4_rev_param_3]; ; CHECK-NEXT: and.b16 %rs2, %rs1, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-NEXT: ld.param.u32 %r1, [test4_rev_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test4_rev_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test4_rev_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test4_rev_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test4_rev_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test4_rev_param_2]; ; CHECK-NEXT: mad.lo.s32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: selp.b32 %r5, %r3, %r4, %p1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r5; @@ -192,10 +192,10 @@ define i32 @test_mad_multi_use(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_mad_multi_use_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_mad_multi_use_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_mad_multi_use_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_mad_multi_use_param_1]; ; CHECK-NEXT: mul.lo.s32 %r3, %r1, %r2; -; CHECK-NEXT: ld.param.u32 %r4, [test_mad_multi_use_param_2]; +; CHECK-NEXT: ld.param.b32 %r4, [test_mad_multi_use_param_2]; ; CHECK-NEXT: add.s32 %r5, %r3, %r4; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .b32 param0; @@ -227,7 +227,7 @@ define i32 @test_mad_fold(i32 %x) { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_mad_fold_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_mad_fold_param_0]; ; CHECK-NEXT: mul.hi.s32 %r2, %r1, 
-2147221471; ; CHECK-NEXT: add.s32 %r3, %r2, %r1; ; CHECK-NEXT: shr.u32 %r4, %r3, 31; diff --git a/llvm/test/CodeGen/NVPTX/convert-fp-i8.ll b/llvm/test/CodeGen/NVPTX/convert-fp-i8.ll index 670e112c26c7..cfb064c85e07 100644 --- a/llvm/test/CodeGen/NVPTX/convert-fp-i8.ll +++ b/llvm/test/CodeGen/NVPTX/convert-fp-i8.ll @@ -11,7 +11,7 @@ define i8 @cvt_u8_f32(float %x) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_u8_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_u8_f32_param_0]; ; CHECK-NEXT: cvt.rzi.u16.f32 %rs1, %f1; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; @@ -28,7 +28,7 @@ define i8 @cvt_u8_f64(double %x) { ; CHECK-NEXT: .reg .b64 %fd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [cvt_u8_f64_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [cvt_u8_f64_param_0]; ; CHECK-NEXT: cvt.rzi.u16.f64 %rs1, %fd1; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; @@ -44,9 +44,9 @@ define float @cvt_f32_i8(i8 %x) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [cvt_f32_i8_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [cvt_f32_i8_param_0]; ; CHECK-NEXT: cvt.rn.f32.u16 %f1, %rs1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %f1; ; CHECK-NEXT: ret; %a = uitofp i8 %x to float ret float %a @@ -59,9 +59,9 @@ define double @cvt_f64_i8(i8 %x) { ; CHECK-NEXT: .reg .b64 %fd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [cvt_f64_i8_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [cvt_f64_i8_param_0]; ; CHECK-NEXT: cvt.rn.f64.u16 %fd1, %rs1; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd1; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd1; ; CHECK-NEXT: ret; %a = uitofp i8 %x to double ret double %a @@ -76,7 +76,7 @@ define float @cvt_f32_s8(i8 %x) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: 
ld.param.s8 %rs1, [cvt_f32_s8_param_0]; ; CHECK-NEXT: cvt.rn.f32.s16 %f1, %rs1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %f1; ; CHECK-NEXT: ret; %a = sitofp i8 %x to float ret float %a @@ -91,7 +91,7 @@ define double @cvt_f64_s8(i8 %x) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.s8 %rs1, [cvt_f64_s8_param_0]; ; CHECK-NEXT: cvt.rn.f64.s16 %fd1, %rs1; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd1; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd1; ; CHECK-NEXT: ret; %a = sitofp i8 %x to double ret double %a @@ -105,7 +105,7 @@ define i8 @cvt_s8_f32(float %x) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_s8_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_s8_f32_param_0]; ; CHECK-NEXT: cvt.rzi.s16.f32 %rs1, %f1; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; ; CHECK-NEXT: and.b32 %r2, %r1, 255; @@ -123,7 +123,7 @@ define i8 @cvt_s8_f64(double %x) { ; CHECK-NEXT: .reg .b64 %fd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [cvt_s8_f64_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [cvt_s8_f64_param_0]; ; CHECK-NEXT: cvt.rzi.s16.f64 %rs1, %fd1; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; ; CHECK-NEXT: and.b32 %r2, %r1, 255; diff --git a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll index 15dd899b714a..ce6a16d9c040 100644 --- a/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll +++ b/llvm/test/CodeGen/NVPTX/convert-int-sm20.ll @@ -10,7 +10,7 @@ ; i16 define i16 @cvt_i16_i32(i32 %x) { -; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i16_i32_param_{{[0-9]+}}] +; CHECK: ld.param.b16 %r[[R0:[0-9]+]], [cvt_i16_i32_param_{{[0-9]+}}] ; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]] ; CHECK: ret %a = trunc i32 %x to i16 @@ -18,7 +18,7 @@ define i16 @cvt_i16_i32(i32 %x) { } define i16 @cvt_i16_i64(i64 %x) { -; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i16_i64_param_{{[0-9]+}}] +; CHECK: ld.param.b16 
%r[[R0:[0-9]+]], [cvt_i16_i64_param_{{[0-9]+}}] ; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]] ; CHECK: ret %a = trunc i64 %x to i16 @@ -30,7 +30,7 @@ define i16 @cvt_i16_i64(i64 %x) { ; i32 define i32 @cvt_i32_i16(i16 %x) { -; CHECK: ld.param.u16 %r[[R0:[0-9]+]], [cvt_i32_i16_param_{{[0-9]+}}] +; CHECK: ld.param.b16 %r[[R0:[0-9]+]], [cvt_i32_i16_param_{{[0-9]+}}] ; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]] ; CHECK: ret %a = zext i16 %x to i32 @@ -38,7 +38,7 @@ define i32 @cvt_i32_i16(i16 %x) { } define i32 @cvt_i32_i64(i64 %x) { -; CHECK: ld.param.u32 %r[[R0:[0-9]+]], [cvt_i32_i64_param_{{[0-9]+}}] +; CHECK: ld.param.b32 %r[[R0:[0-9]+]], [cvt_i32_i64_param_{{[0-9]+}}] ; CHECK: st.param.b32 [func_retval{{[0-9]+}}], %r[[R0]] ; CHECK: ret %a = trunc i64 %x to i32 @@ -50,7 +50,7 @@ define i32 @cvt_i32_i64(i64 %x) { ; i64 define i64 @cvt_i64_i16(i16 %x) { -; CHECK: ld.param.u16 %rd[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}] +; CHECK: ld.param.b16 %rd[[R0:[0-9]+]], [cvt_i64_i16_param_{{[0-9]+}}] ; CHECK: st.param.b64 [func_retval{{[0-9]+}}], %rd[[R0]] ; CHECK: ret %a = zext i16 %x to i64 @@ -58,7 +58,7 @@ define i64 @cvt_i64_i16(i16 %x) { } define i64 @cvt_i64_i32(i32 %x) { -; CHECK: ld.param.u32 %rd[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}] +; CHECK: ld.param.b32 %rd[[R0:[0-9]+]], [cvt_i64_i32_param_{{[0-9]+}}] ; CHECK: st.param.b64 [func_retval{{[0-9]+}}], %rd[[R0]] ; CHECK: ret %a = zext i32 %x to i64 diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100.ll b/llvm/test/CodeGen/NVPTX/convert-sm100.ll index 7230872b3427..d5fe45f8051f 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm100.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm100.ll @@ -14,7 +14,7 @@ define i32 @cvt_rn_satf_tf32_f32(float %f1) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_satf_tf32_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_satf_tf32_f32_param_0]; ; CHECK-NEXT: cvt.rn.satfinite.tf32.f32 %r1, %f1; ; 
CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -29,7 +29,7 @@ define i32 @cvt_rn_relu_satf_tf32_f32(float %f1) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_satf_tf32_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_relu_satf_tf32_f32_param_0]; ; CHECK-NEXT: cvt.rn.relu.satfinite.tf32.f32 %r1, %f1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -44,7 +44,7 @@ define i32 @cvt_rz_satf_tf32_f32(float %f1) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_satf_tf32_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_satf_tf32_f32_param_0]; ; CHECK-NEXT: cvt.rz.satfinite.tf32.f32 %r1, %f1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -59,7 +59,7 @@ define i32 @cvt_rz_relu_satf_tf32_f32(float %f1) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_satf_tf32_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_relu_satf_tf32_f32_param_0]; ; CHECK-NEXT: cvt.rz.relu.satfinite.tf32.f32 %r1, %f1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll index 04d7a65f9e40..def2575deb04 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm100a.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm100a.ll @@ -14,8 +14,8 @@ define i16 @cvt_rn_sf_e2m3x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_sf_e2m3x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_sf_e2m3x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_sf_e2m3x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_sf_e2m3x2_f32_param_1]; ; CHECK-NEXT: cvt.rn.satfinite.e2m3x2.f32 %rs1, %f1, %f2; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; ; CHECK-NEXT: 
st.param.b32 [func_retval0], %r1; @@ -32,8 +32,8 @@ define i16 @cvt_rn_relu_sf_e2m3x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_sf_e2m3x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_relu_sf_e2m3x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_relu_sf_e2m3x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_relu_sf_e2m3x2_f32_param_1]; ; CHECK-NEXT: cvt.rn.satfinite.relu.e2m3x2.f32 %rs1, %f1, %f2; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; @@ -50,8 +50,8 @@ define i16 @cvt_rn_sf_e3m2x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_sf_e3m2x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_sf_e3m2x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_sf_e3m2x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_sf_e3m2x2_f32_param_1]; ; CHECK-NEXT: cvt.rn.satfinite.e3m2x2.f32 %rs1, %f1, %f2; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; @@ -68,8 +68,8 @@ define i16 @cvt_rn_relu_sf_e3m2x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_sf_e3m2x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_relu_sf_e3m2x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_relu_sf_e3m2x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_relu_sf_e3m2x2_f32_param_1]; ; CHECK-NEXT: cvt.rn.satfinite.relu.e3m2x2.f32 %rs1, %f1, %f2; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; @@ -85,7 +85,7 @@ define <2 x half> @cvt_rn_f16x2_e2m3x2(i16 %in) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_f16x2_e2m3x2_param_0]; +; CHECK-NEXT: ld.param.b16 %rs1, 
[cvt_rn_f16x2_e2m3x2_param_0]; ; CHECK-NEXT: cvt.rn.f16x2.e2m3x2 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -100,7 +100,7 @@ define <2 x half> @cvt_rn_relu_f16x2_e2m3x2_relu(i16 %in) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_relu_f16x2_e2m3x2_relu_param_0]; +; CHECK-NEXT: ld.param.b16 %rs1, [cvt_rn_relu_f16x2_e2m3x2_relu_param_0]; ; CHECK-NEXT: cvt.rn.relu.f16x2.e2m3x2 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -115,7 +115,7 @@ define <2 x half> @cvt_rn_f16x2_e3m2x2(i16 %in) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_f16x2_e3m2x2_param_0]; +; CHECK-NEXT: ld.param.b16 %rs1, [cvt_rn_f16x2_e3m2x2_param_0]; ; CHECK-NEXT: cvt.rn.f16x2.e3m2x2 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -130,7 +130,7 @@ define <2 x half> @cvt_rn_relu_f16x2_e3m2x2(i16 %in) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [cvt_rn_relu_f16x2_e3m2x2_param_0]; +; CHECK-NEXT: ld.param.b16 %rs1, [cvt_rn_relu_f16x2_e3m2x2_param_0]; ; CHECK-NEXT: cvt.rn.relu.f16x2.e3m2x2 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -146,8 +146,8 @@ define i16 @cvt_rz_ue8m0x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_ue8m0x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_ue8m0x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_ue8m0x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rz_ue8m0x2_f32_param_1]; ; CHECK-NEXT: cvt.rz.ue8m0x2.f32 %rs1, %f1, %f2; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; @@ -164,8 +164,8 @@ define i16 @cvt_rz_sf_ue8m0x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: 
; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_sf_ue8m0x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_sf_ue8m0x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_sf_ue8m0x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rz_sf_ue8m0x2_f32_param_1]; ; CHECK-NEXT: cvt.rz.satfinite.ue8m0x2.f32 %rs1, %f1, %f2; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; @@ -182,8 +182,8 @@ define i16 @cvt_rp_ue8m0x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rp_ue8m0x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rp_ue8m0x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rp_ue8m0x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rp_ue8m0x2_f32_param_1]; ; CHECK-NEXT: cvt.rp.ue8m0x2.f32 %rs1, %f1, %f2; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; @@ -200,8 +200,8 @@ define i16 @cvt_rp_sf_ue8m0x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rp_sf_ue8m0x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rp_sf_ue8m0x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rp_sf_ue8m0x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rp_sf_ue8m0x2_f32_param_1]; ; CHECK-NEXT: cvt.rp.satfinite.ue8m0x2.f32 %rs1, %f1, %f2; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; @@ -281,7 +281,7 @@ define <2 x bfloat> @cvt_bf16x2_ue8m0x2(i16 %in) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [cvt_bf16x2_ue8m0x2_param_0]; +; CHECK-NEXT: ld.param.b16 %rs1, [cvt_bf16x2_ue8m0x2_param_0]; ; CHECK-NEXT: cvt.rn.bf16x2.ue8m0x2 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/convert-sm80.ll b/llvm/test/CodeGen/NVPTX/convert-sm80.ll 
index eb7a6bdd222b..0372d281ea35 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm80.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm80.ll @@ -10,8 +10,8 @@ define <2 x bfloat> @cvt_rn_bf16x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_bf16x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_bf16x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_bf16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_bf16x2_f32_param_1]; ; CHECK-NEXT: cvt.rn.bf16x2.f32 %r1, %f1, %f2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -26,8 +26,8 @@ define <2 x bfloat> @cvt_rn_relu_bf16x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_bf16x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_relu_bf16x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_relu_bf16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_relu_bf16x2_f32_param_1]; ; CHECK-NEXT: cvt.rn.relu.bf16x2.f32 %r1, %f1, %f2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -42,8 +42,8 @@ define <2 x bfloat> @cvt_rz_bf16x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_bf16x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_bf16x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_bf16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rz_bf16x2_f32_param_1]; ; CHECK-NEXT: cvt.rz.bf16x2.f32 %r1, %f1, %f2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -58,8 +58,8 @@ define <2 x bfloat> @cvt_rz_relu_bf16x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_bf16x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_relu_bf16x2_f32_param_1]; +; 
CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_relu_bf16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rz_relu_bf16x2_f32_param_1]; ; CHECK-NEXT: cvt.rz.relu.bf16x2.f32 %r1, %f1, %f2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -79,8 +79,8 @@ define <2 x half> @cvt_rn_f16x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_f16x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_f16x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_f16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_f16x2_f32_param_1]; ; CHECK-NEXT: cvt.rn.f16x2.f32 %r1, %f1, %f2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -95,8 +95,8 @@ define <2 x half> @cvt_rn_relu_f16x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_f16x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rn_relu_f16x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_relu_f16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rn_relu_f16x2_f32_param_1]; ; CHECK-NEXT: cvt.rn.relu.f16x2.f32 %r1, %f1, %f2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -111,8 +111,8 @@ define <2 x half> @cvt_rz_f16x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_f16x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_f16x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_f16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rz_f16x2_f32_param_1]; ; CHECK-NEXT: cvt.rz.f16x2.f32 %r1, %f1, %f2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -127,8 +127,8 @@ define <2 x half> @cvt_rz_relu_f16x2_f32(float %f1, float %f2) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, 
[cvt_rz_relu_f16x2_f32_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [cvt_rz_relu_f16x2_f32_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_relu_f16x2_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [cvt_rz_relu_f16x2_f32_param_1]; ; CHECK-NEXT: cvt.rz.relu.f16x2.f32 %r1, %f1, %f2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -148,7 +148,7 @@ define bfloat @cvt_rn_bf16_f32(float %f1) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_bf16_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_bf16_f32_param_0]; ; CHECK-NEXT: cvt.rn.bf16.f32 %rs1, %f1; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; @@ -163,7 +163,7 @@ define bfloat @cvt_rn_relu_bf16_f32(float %f1) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_bf16_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_relu_bf16_f32_param_0]; ; CHECK-NEXT: cvt.rn.relu.bf16.f32 %rs1, %f1; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; @@ -178,7 +178,7 @@ define bfloat @cvt_rz_bf16_f32(float %f1) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_bf16_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_bf16_f32_param_0]; ; CHECK-NEXT: cvt.rz.bf16.f32 %rs1, %f1; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; @@ -193,7 +193,7 @@ define bfloat @cvt_rz_relu_bf16_f32(float %f1) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_bf16_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_relu_bf16_f32_param_0]; ; CHECK-NEXT: cvt.rz.relu.bf16.f32 %rs1, %f1; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; @@ -213,7 +213,7 @@ define i32 @cvt_rna_tf32_f32(float %f1) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: 
ld.param.f32 %f1, [cvt_rna_tf32_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rna_tf32_f32_param_0]; ; CHECK-NEXT: cvt.rna.tf32.f32 %r1, %f1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -231,8 +231,8 @@ define <2 x bfloat> @fold_ff2bf16x2(float %lo, float %hi) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [fold_ff2bf16x2_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [fold_ff2bf16x2_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [fold_ff2bf16x2_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [fold_ff2bf16x2_param_1]; ; CHECK-NEXT: cvt.rn.bf16x2.f32 %r1, %f2, %f1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -250,8 +250,8 @@ define <2 x half> @fold_ff2f16x2(float %lo, float %hi) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [fold_ff2f16x2_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [fold_ff2f16x2_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [fold_ff2f16x2_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [fold_ff2f16x2_param_1]; ; CHECK-NEXT: cvt.rn.f16x2.f32 %r1, %f2, %f1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/convert-sm90.ll b/llvm/test/CodeGen/NVPTX/convert-sm90.ll index 340117f98cd9..dba8be1ef5a4 100644 --- a/llvm/test/CodeGen/NVPTX/convert-sm90.ll +++ b/llvm/test/CodeGen/NVPTX/convert-sm90.ll @@ -14,7 +14,7 @@ define i32 @cvt_rn_tf32_f32(float %f1) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_tf32_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_tf32_f32_param_0]; ; CHECK-NEXT: cvt.rn.tf32.f32 %r1, %f1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -29,7 +29,7 @@ define i32 @cvt_rn_relu_tf32_f32(float %f1) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rn_relu_tf32_f32_param_0]; +; 
CHECK-NEXT: ld.param.b32 %f1, [cvt_rn_relu_tf32_f32_param_0]; ; CHECK-NEXT: cvt.rn.relu.tf32.f32 %r1, %f1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -44,7 +44,7 @@ define i32 @cvt_rz_tf32_f32(float %f1) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_tf32_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_tf32_f32_param_0]; ; CHECK-NEXT: cvt.rz.tf32.f32 %r1, %f1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -59,7 +59,7 @@ define i32 @cvt_rz_relu_tf32_f32(float %f1) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [cvt_rz_relu_tf32_f32_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [cvt_rz_relu_tf32_f32_param_0]; ; CHECK-NEXT: cvt.rz.relu.tf32.f32 %r1, %f1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/copysign.ll b/llvm/test/CodeGen/NVPTX/copysign.ll index 2e305e683d77..d8198182220e 100644 --- a/llvm/test/CodeGen/NVPTX/copysign.ll +++ b/llvm/test/CodeGen/NVPTX/copysign.ll @@ -11,10 +11,10 @@ define float @fcopysign_f_f(float %a, float %b) { ; CHECK-NEXT: .reg .b32 %f<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [fcopysign_f_f_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [fcopysign_f_f_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [fcopysign_f_f_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [fcopysign_f_f_param_1]; ; CHECK-NEXT: copysign.f32 %f3, %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-NEXT: ret; %val = call float @llvm.copysign.f32(float %a, float %b) ret float %val @@ -26,10 +26,10 @@ define double @fcopysign_d_d(double %a, double %b) { ; CHECK-NEXT: .reg .b64 %fd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [fcopysign_d_d_param_0]; -; CHECK-NEXT: ld.param.f64 %fd2, [fcopysign_d_d_param_1]; +; CHECK-NEXT: 
ld.param.b64 %fd1, [fcopysign_d_d_param_0]; +; CHECK-NEXT: ld.param.b64 %fd2, [fcopysign_d_d_param_1]; ; CHECK-NEXT: copysign.f64 %fd3, %fd2, %fd1; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd3; ; CHECK-NEXT: ret; %val = call double @llvm.copysign.f64(double %a, double %b) ret double %val @@ -43,15 +43,15 @@ define float @fcopysign_f_d(float %a, double %b) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [fcopysign_f_d_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [fcopysign_f_d_param_0]; ; CHECK-NEXT: abs.f32 %f2, %f1; ; CHECK-NEXT: neg.f32 %f3, %f2; -; CHECK-NEXT: ld.param.u64 %rd1, [fcopysign_f_d_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [fcopysign_f_d_param_1]; ; CHECK-NEXT: shr.u64 %rd2, %rd1, 63; ; CHECK-NEXT: and.b64 %rd3, %rd2, 1; ; CHECK-NEXT: setp.ne.b64 %p1, %rd3, 0; ; CHECK-NEXT: selp.f32 %f4, %f3, %f2, %p1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f4; +; CHECK-NEXT: st.param.b32 [func_retval0], %f4; ; CHECK-NEXT: ret; %c = fptrunc double %b to float %val = call float @llvm.copysign.f32(float %a, float %c) @@ -66,15 +66,15 @@ define float @fcopysign_f_h(float %a, half %b) { ; CHECK-NEXT: .reg .b32 %f<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [fcopysign_f_h_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [fcopysign_f_h_param_0]; ; CHECK-NEXT: abs.f32 %f2, %f1; ; CHECK-NEXT: neg.f32 %f3, %f2; -; CHECK-NEXT: ld.param.u16 %rs1, [fcopysign_f_h_param_1]; +; CHECK-NEXT: ld.param.b16 %rs1, [fcopysign_f_h_param_1]; ; CHECK-NEXT: shr.u16 %rs2, %rs1, 15; ; CHECK-NEXT: and.b16 %rs3, %rs2, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs3, 0; ; CHECK-NEXT: selp.f32 %f4, %f3, %f2, %p1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f4; +; CHECK-NEXT: st.param.b32 [func_retval0], %f4; ; CHECK-NEXT: ret; %c = fpext half %b to float %val = call float @llvm.copysign.f32(float %a, float %c) @@ -89,15 +89,15 @@ define double @fcopysign_d_f(double 
%a, float %b) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [fcopysign_d_f_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [fcopysign_d_f_param_0]; ; CHECK-NEXT: abs.f64 %fd2, %fd1; ; CHECK-NEXT: neg.f64 %fd3, %fd2; -; CHECK-NEXT: ld.param.u32 %r1, [fcopysign_d_f_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [fcopysign_d_f_param_1]; ; CHECK-NEXT: shr.u32 %r2, %r1, 31; ; CHECK-NEXT: and.b32 %r3, %r2, 1; ; CHECK-NEXT: setp.ne.b32 %p1, %r3, 0; ; CHECK-NEXT: selp.f64 %fd4, %fd3, %fd2, %p1; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd4; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd4; ; CHECK-NEXT: ret; %c = fpext float %b to double %val = call double @llvm.copysign.f64(double %a, double %c) @@ -112,15 +112,15 @@ define double @fcopysign_d_h(double %a, half %b) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [fcopysign_d_h_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [fcopysign_d_h_param_0]; ; CHECK-NEXT: abs.f64 %fd2, %fd1; ; CHECK-NEXT: neg.f64 %fd3, %fd2; -; CHECK-NEXT: ld.param.u16 %rs1, [fcopysign_d_h_param_1]; +; CHECK-NEXT: ld.param.b16 %rs1, [fcopysign_d_h_param_1]; ; CHECK-NEXT: shr.u16 %rs2, %rs1, 15; ; CHECK-NEXT: and.b16 %rs3, %rs2, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs3, 0; ; CHECK-NEXT: selp.f64 %fd4, %fd3, %fd2, %p1; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd4; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd4; ; CHECK-NEXT: ret; %c = fpext half %b to double %val = call double @llvm.copysign.f64(double %a, double %c) diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll index 530a896642b8..b27f3078300b 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s.ll @@ -25,14 +25,14 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; ; 
CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_0]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1}], [%rd2]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -44,14 +44,14 @@ define void @cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, 
[cp_async_bulk_tensor_g2s_tile_1d_param_0]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_1d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_1d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_1d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_1d_param_3]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3}], [%r2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_1d_param_5]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_1d_param_4]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -74,15 +74,15 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_0]; -; 
CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; -; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2}], [%rd2]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -94,15 +94,15 @@ define void @cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: 
ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_0]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_2d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_2d_param_4]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4}], [%r2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_2d_param_6]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_2d_param_5]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -125,16 +125,16 @@ define void 
@cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_0]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_3d_param_2]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; -; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; -; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -146,16 +146,16 @@ define void @cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_0]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_3d_param_5]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_3d_param_7]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2; -; 
CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_3d_param_6]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -178,17 +178,17 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_0]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_tile_4d_param_2]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_3]; -; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; -; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; -; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; ; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -200,17 +200,17 @@ define void @cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_0]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_4d_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_3]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 
%r2, [cp_async_bulk_tensor_g2s_tile_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_4d_param_6]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_4d_param_8]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -233,18 +233,18 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_0]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd3, 
[cp_async_bulk_tensor_g2s_tile_5d_param_2]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_3]; -; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_4]; -; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; -; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; -; CHECK-PTX64-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_tile_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; ; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -256,18 +256,18 @@ define void @cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_0]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_3]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_4]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_tile_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_tile_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_tile_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_tile_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_tile_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_tile_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_tile_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_tile_5d_param_7]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, 
%r6, %r7}], [%r2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_tile_5d_param_9]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_tile_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -290,17 +290,17 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_im2col_3d_param_2]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_3]; -; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; -; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; -; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, 
[cp_async_bulk_tensor_g2s_im2col_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; -; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4; -; CHECK-PTX64-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -312,17 +312,17 @@ define void @cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, 
[cp_async_bulk_tensor_g2s_im2col_3d_param_3]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_3d_param_6]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_3d_param_8]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_3d_param_7]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2; ; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -345,19 +345,19 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_im2col_4d_param_2]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_3]; -; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_4]; -; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_5]; -; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; -; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; -; CHECK-PTX64-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2col_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; ; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; -; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; -; CHECK-PTX64-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -369,19 +369,19 @@ define void @cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_3]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_4]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_4d_param_5]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, 
[cp_async_bulk_tensor_g2s_im2col_4d_param_7]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2col_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_4d_param_8]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_4d_param_10]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_4d_param_9]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; ; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -404,21 +404,21 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_g2s_im2col_5d_param_2]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_3]; -; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_4]; -; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_5]; -; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_6]; -; CHECK-PTX64-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_5d_param_7]; -; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8]; -; CHECK-PTX64-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; -; CHECK-PTX64-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_g2s_im2col_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, 
[cp_async_bulk_tensor_g2s_im2col_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; -; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_tensor_g2s_im2col_5d_param_12]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4; -; CHECK-PTX64-NEXT: ld.param.u16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -430,21 +430,21 @@ define void @cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspa ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 
%r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_3]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_4]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_g2s_im2col_5d_param_5]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_g2s_im2col_5d_param_6]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r7, [cp_async_bulk_tensor_g2s_im2col_5d_param_7]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_g2s_im2col_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_g2s_im2col_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_g2s_im2col_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_g2s_im2col_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_g2s_im2col_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_g2s_im2col_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_g2s_im2col_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [cp_async_bulk_tensor_g2s_im2col_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_g2s_im2col_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_g2s_im2col_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_g2s_im2col_5d_param_10]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; 
CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_g2s_im2col_5d_param_12]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [cp_async_bulk_tensor_g2s_im2col_5d_param_11]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll index 262df0777a20..c32c5559b159 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll @@ -22,10 +22,10 @@ define void @cp_async_bulk_tensor_prefetch_tile_1d(ptr %tmap, i32 %d0, i64 %ch) ; CHECK-PTX-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: -; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_tile_1d_param_0]; -; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_tile_1d_param_1]; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_tile_1d_param_0]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_tile_1d_param_1]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.1d.L2.global.tile [%rd1, {%r1}]; -; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_tile_1d_param_2]; +; CHECK-PTX-NEXT: ld.param.b64 %rd2, 
[cp_async_bulk_tensor_prefetch_tile_1d_param_2]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.1d.L2.global.tile.L2::cache_hint [%rd1, {%r1}], %rd2; ; CHECK-PTX-NEXT: ret; tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tmap, i32 %d0, i64 undef, i1 0) @@ -41,11 +41,11 @@ define void @cp_async_bulk_tensor_prefetch_tile_2d(i32 %flag, ptr %tmap, i32 %d0 ; CHECK-PTX-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: -; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_tile_2d_param_1]; -; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_tile_2d_param_2]; -; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_tile_2d_param_3]; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_tile_2d_param_1]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_tile_2d_param_2]; +; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_tile_2d_param_3]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.2d.L2.global.tile [%rd1, {%r1, %r2}]; -; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_tile_2d_param_4]; +; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_tile_2d_param_4]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.2d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2}], %rd2; ; CHECK-PTX-NEXT: ret; tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) @@ -62,14 +62,14 @@ define void @cp_async_bulk_tensor_prefetch_3d(i32 %flag, ptr %tmap, i32 %d0, i32 ; CHECK-PTX-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: -; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_3d_param_1]; -; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_3d_param_2]; -; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_3d_param_3]; -; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_prefetch_3d_param_4]; +; 
CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_3d_param_1]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_3d_param_2]; +; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_3d_param_3]; +; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_prefetch_3d_param_4]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.tile [%rd1, {%r1, %r2, %r3}]; -; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_3d_param_6]; +; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_3d_param_6]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3}], %rd2; -; CHECK-PTX-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_prefetch_3d_param_5]; +; CHECK-PTX-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_prefetch_3d_param_5]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col [%rd1, {%r1, %r2, %r3}], {%rs1}; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.3d.L2.global.im2col.L2::cache_hint [%rd1, {%r1, %r2, %r3}], {%rs1}, %rd2; ; CHECK-PTX-NEXT: ret; @@ -90,16 +90,16 @@ define void @cp_async_bulk_tensor_prefetch_4d(i32 %flag, ptr %tmap, i32 %d0, i32 ; CHECK-PTX-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: -; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_4d_param_1]; -; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_4d_param_2]; -; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_4d_param_3]; -; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_prefetch_4d_param_4]; -; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_prefetch_4d_param_5]; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_4d_param_1]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_4d_param_2]; +; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_4d_param_3]; +; CHECK-PTX-NEXT: ld.param.b32 %r3, 
[cp_async_bulk_tensor_prefetch_4d_param_4]; +; CHECK-PTX-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_prefetch_4d_param_5]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4}]; -; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_4d_param_8]; +; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_4d_param_8]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], %rd2; -; CHECK-PTX-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_prefetch_4d_param_6]; -; CHECK-PTX-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_prefetch_4d_param_7]; +; CHECK-PTX-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_prefetch_4d_param_6]; +; CHECK-PTX-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_prefetch_4d_param_7]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.4d.L2.global.im2col.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], {%rs1, %rs2}, %rd2; ; CHECK-PTX-NEXT: ret; @@ -120,18 +120,18 @@ define void @cp_async_bulk_tensor_prefetch_5d(i32 %flag, ptr %tmap, i32 %d0, i32 ; CHECK-PTX-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: -; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_prefetch_5d_param_1]; -; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_prefetch_5d_param_2]; -; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_prefetch_5d_param_3]; -; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_prefetch_5d_param_4]; -; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_prefetch_5d_param_5]; -; CHECK-PTX-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_prefetch_5d_param_6]; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_5d_param_1]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_5d_param_2]; +; CHECK-PTX-NEXT: ld.param.b32 %r2, 
[cp_async_bulk_tensor_prefetch_5d_param_3]; +; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_prefetch_5d_param_4]; +; CHECK-PTX-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_prefetch_5d_param_5]; +; CHECK-PTX-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_prefetch_5d_param_6]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4, %r5}]; -; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_prefetch_5d_param_10]; +; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_5d_param_10]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], %rd2; -; CHECK-PTX-NEXT: ld.param.u16 %rs1, [cp_async_bulk_tensor_prefetch_5d_param_7]; -; CHECK-PTX-NEXT: ld.param.u16 %rs2, [cp_async_bulk_tensor_prefetch_5d_param_8]; -; CHECK-PTX-NEXT: ld.param.u16 %rs3, [cp_async_bulk_tensor_prefetch_5d_param_9]; +; CHECK-PTX-NEXT: ld.param.b16 %rs1, [cp_async_bulk_tensor_prefetch_5d_param_7]; +; CHECK-PTX-NEXT: ld.param.b16 %rs2, [cp_async_bulk_tensor_prefetch_5d_param_8]; +; CHECK-PTX-NEXT: ld.param.b16 %rs3, [cp_async_bulk_tensor_prefetch_5d_param_9]; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2, %rs3}; ; CHECK-PTX-NEXT: cp.async.bulk.prefetch.tensor.5d.L2.global.im2col.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], {%rs1, %rs2, %rs3}, %rd2; ; CHECK-PTX-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll index b0c39de17811..b73631d219ba 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-reduce.ll @@ -22,10 +22,10 @@ define void @cp_async_bulk_tensor_reduce_tile_1d(ptr addrspace(3) %src, ptr %tma ; CHECK-PTX-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: -; CHECK-PTX-NEXT: ld.param.u64 %rd1, 
[cp_async_bulk_tensor_reduce_tile_1d_param_0]; -; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_1d_param_1]; -; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_1d_param_2]; -; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_1d_param_3]; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_tile_1d_param_0]; +; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_tile_1d_param_1]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_tile_1d_param_2]; +; CHECK-PTX-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_tile_1d_param_3]; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.1d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; @@ -71,11 +71,11 @@ define void @cp_async_bulk_tensor_reduce_tile_2d(ptr addrspace(3) %src, ptr %tma ; CHECK-PTX-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: -; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_2d_param_0]; -; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_2d_param_1]; -; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_2d_param_2]; -; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_2d_param_3]; -; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_2d_param_4]; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_tile_2d_param_0]; +; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_tile_2d_param_1]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_tile_2d_param_2]; +; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_tile_2d_param_3]; +; CHECK-PTX-NEXT: 
ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_tile_2d_param_4]; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.2d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; @@ -121,12 +121,12 @@ define void @cp_async_bulk_tensor_reduce_tile_3d(ptr addrspace(3) %src, ptr %tma ; CHECK-PTX-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: -; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_3d_param_0]; -; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_3d_param_1]; -; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_3d_param_2]; -; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_3d_param_3]; -; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_3d_param_4]; -; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_3d_param_5]; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_tile_3d_param_0]; +; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_tile_3d_param_1]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_tile_3d_param_2]; +; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_tile_3d_param_3]; +; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_tile_3d_param_4]; +; CHECK-PTX-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_tile_3d_param_5]; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; ; CHECK-PTX-NEXT: 
cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; @@ -172,13 +172,13 @@ define void @cp_async_bulk_tensor_reduce_tile_4d(ptr addrspace(3) %src, ptr %tma ; CHECK-PTX-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: -; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_4d_param_0]; -; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_4d_param_1]; -; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_4d_param_2]; -; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_4d_param_3]; -; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_4d_param_4]; -; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_tile_4d_param_5]; -; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_4d_param_6]; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_tile_4d_param_0]; +; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_tile_4d_param_1]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_tile_4d_param_2]; +; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_tile_4d_param_3]; +; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_tile_4d_param_4]; +; CHECK-PTX-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_reduce_tile_4d_param_5]; +; CHECK-PTX-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_tile_4d_param_6]; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; @@ -224,14 +224,14 @@ define void 
@cp_async_bulk_tensor_reduce_tile_5d(ptr addrspace(3) %src, ptr %tma ; CHECK-PTX-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: -; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_tile_5d_param_0]; -; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_tile_5d_param_1]; -; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_tile_5d_param_2]; -; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_tile_5d_param_3]; -; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_tile_5d_param_4]; -; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_tile_5d_param_5]; -; CHECK-PTX-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_reduce_tile_5d_param_6]; -; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_tile_5d_param_7]; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_tile_5d_param_0]; +; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_tile_5d_param_1]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_tile_5d_param_2]; +; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_tile_5d_param_3]; +; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_tile_5d_param_4]; +; CHECK-PTX-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_reduce_tile_5d_param_5]; +; CHECK-PTX-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_reduce_tile_5d_param_6]; +; CHECK-PTX-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_tile_5d_param_7]; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; @@ -277,12 +277,12 @@ define void 
@cp_async_bulk_tensor_reduce_im2col_3d(ptr addrspace(3) %src, ptr %t ; CHECK-PTX-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: -; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_3d_param_0]; -; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_3d_param_1]; -; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_3d_param_2]; -; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_3d_param_3]; -; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_3d_param_4]; -; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_3d_param_5]; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_im2col_3d_param_0]; +; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_im2col_3d_param_1]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_im2col_3d_param_2]; +; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_im2col_3d_param_3]; +; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_im2col_3d_param_4]; +; CHECK-PTX-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_im2col_3d_param_5]; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.3d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; @@ -328,13 +328,13 @@ define void @cp_async_bulk_tensor_reduce_im2col_4d(ptr addrspace(3) %src, ptr %t ; CHECK-PTX-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: -; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_4d_param_0]; -; CHECK-PTX-NEXT: ld.param.u64 %rd2, 
[cp_async_bulk_tensor_reduce_im2col_4d_param_1]; -; CHECK-PTX-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_reduce_im2col_4d_param_2]; -; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_4d_param_3]; -; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_4d_param_4]; -; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_im2col_4d_param_5]; -; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_4d_param_6]; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_im2col_4d_param_0]; +; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_im2col_4d_param_1]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_im2col_4d_param_2]; +; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_im2col_4d_param_3]; +; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_im2col_4d_param_4]; +; CHECK-PTX-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_reduce_im2col_4d_param_5]; +; CHECK-PTX-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_im2col_4d_param_6]; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.4d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; @@ -380,14 +380,14 @@ define void @cp_async_bulk_tensor_reduce_im2col_5d(ptr addrspace(3) %src, ptr %t ; CHECK-PTX-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: -; CHECK-PTX-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_reduce_im2col_5d_param_0]; -; CHECK-PTX-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_reduce_im2col_5d_param_1]; -; CHECK-PTX-NEXT: ld.param.u32 %r1, 
[cp_async_bulk_tensor_reduce_im2col_5d_param_2]; -; CHECK-PTX-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_reduce_im2col_5d_param_3]; -; CHECK-PTX-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_reduce_im2col_5d_param_4]; -; CHECK-PTX-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_reduce_im2col_5d_param_5]; -; CHECK-PTX-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_reduce_im2col_5d_param_6]; -; CHECK-PTX-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_reduce_im2col_5d_param_7]; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_reduce_im2col_5d_param_0]; +; CHECK-PTX-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_reduce_im2col_5d_param_1]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_reduce_im2col_5d_param_2]; +; CHECK-PTX-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_reduce_im2col_5d_param_3]; +; CHECK-PTX-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_reduce_im2col_5d_param_4]; +; CHECK-PTX-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_reduce_im2col_5d_param_5]; +; CHECK-PTX-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_reduce_im2col_5d_param_6]; +; CHECK-PTX-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_reduce_im2col_5d_param_7]; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.add.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.min.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; ; CHECK-PTX-NEXT: cp.reduce.async.bulk.tensor.5d.global.shared::cta.max.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll index de3d9ddaac9c..6a366f658c77 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll @@ -24,11 +24,11 @@ define void 
@cp_async_bulk_tensor_s2g_tile_1d(ptr addrspace(3) %src, ptr %tmap, ; CHECK-PTX64-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_0]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_1]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_2]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd2, {%r1}], [%rd1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_tile_1d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_tile_1d_param_3]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3; ; CHECK-PTX64-NEXT: ret; ; @@ -38,11 +38,11 @@ define void @cp_async_bulk_tensor_s2g_tile_1d(ptr addrspace(3) %src, ptr %tmap, ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_0]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_tile_1d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_1d_param_2]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd1, {%r2}], [%r1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, 
[cp_async_bulk_tensor_s2g_tile_1d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_3]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2}], [%r1], %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 undef, i1 0) @@ -58,12 +58,12 @@ define void @cp_async_bulk_tensor_s2g_tile_2d(i32 %flag, ptr addrspace(3) %src, ; CHECK-PTX64-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_2]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_tile_2d_param_3]; -; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_4]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_tile_2d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_tile_2d_param_5]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3; ; CHECK-PTX64-NEXT: ret; ; @@ -73,12 +73,12 @@ define void @cp_async_bulk_tensor_s2g_tile_2d(i32 %flag, ptr addrspace(3) %src, ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, 
[cp_async_bulk_tensor_s2g_tile_2d_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_3]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_tile_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_tile_2d_param_4]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3}], [%r1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_5]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3}], [%r1], %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 undef, i1 0) @@ -94,13 +94,13 @@ define void @cp_async_bulk_tensor_s2g_3d(i32 %flag, ptr addrspace(3) %src, ptr % ; CHECK-PTX64-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_2]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_3d_param_3]; -; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_3d_param_4]; -; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, 
[cp_async_bulk_tensor_s2g_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_3d_param_5]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_3d_param_6]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3; @@ -112,13 +112,13 @@ define void @cp_async_bulk_tensor_s2g_3d(i32 %flag, ptr addrspace(3) %src, ptr % ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_3d_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_3d_param_3]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_3d_param_4]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, 
[cp_async_bulk_tensor_s2g_3d_param_5]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_6]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4}], [%r1], %rd2; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4}], [%r1], %rd2; @@ -139,14 +139,14 @@ define void @cp_async_bulk_tensor_s2g_4d(i32 %flag, ptr addrspace(3) %src, ptr % ; CHECK-PTX64-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_2]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_4d_param_3]; -; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_4d_param_4]; -; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_4d_param_5]; -; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_4d_param_6]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; -; 
CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_tensor_s2g_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_4d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3; @@ -158,14 +158,14 @@ define void @cp_async_bulk_tensor_s2g_4d(i32 %flag, ptr addrspace(3) %src, ptr % ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_4d_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_4d_param_3]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_4d_param_4]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_4d_param_5]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_s2g_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_4d_param_6]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5}], [%r1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, 
[cp_async_bulk_tensor_s2g_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_7]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5}], [%r1], %rd2; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4, %r5}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5}], [%r1], %rd2; @@ -186,15 +186,15 @@ define void @cp_async_bulk_tensor_s2g_5d(i32 %flag, ptr addrspace(3) %src, ptr % ; CHECK-PTX64-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_2]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_5d_param_3]; -; CHECK-PTX64-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_5d_param_4]; -; CHECK-PTX64-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_5d_param_5]; -; CHECK-PTX64-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_5d_param_6]; -; CHECK-PTX64-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_s2g_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_5d_param_7]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; -; CHECK-PTX64-NEXT: ld.param.u64 
%rd3, [cp_async_bulk_tensor_s2g_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_5d_param_8]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1]; ; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3; @@ -206,15 +206,15 @@ define void @cp_async_bulk_tensor_s2g_5d(i32 %flag, ptr addrspace(3) %src, ptr % ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_tensor_s2g_5d_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_tensor_s2g_5d_param_3]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_tensor_s2g_5d_param_4]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, [cp_async_bulk_tensor_s2g_5d_param_5]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r5, [cp_async_bulk_tensor_s2g_5d_param_6]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r6, [cp_async_bulk_tensor_s2g_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [cp_async_bulk_tensor_s2g_5d_param_7]; ; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_8]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2; diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll index bf1b86e37ae7..77694ac82459 100644 --- a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll @@ -19,14 +19,14 @@ define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr ; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_g2s_param_0]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_g2s_param_1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_g2s_param_2]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_g2s_param_3]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_g2s_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_3]; ; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd4, [cp_async_bulk_g2s_param_5]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [cp_async_bulk_g2s_param_5]; ; CHECK-PTX64-NEXT: 
cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rd4; -; CHECK-PTX64-NEXT: ld.param.u16 %rs1, [cp_async_bulk_g2s_param_4]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [cp_async_bulk_g2s_param_4]; ; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%rd3], [%rd1], %r1, [%rd2], %rs1; ; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rs1, %rd4; ; CHECK-PTX64-NEXT: ret; @@ -38,14 +38,14 @@ define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_g2s_param_0]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_g2s_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_g2s_param_2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_g2s_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_g2s_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_g2s_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_g2s_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_g2s_param_3]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%r2], [%rd1], %r3, [%r1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_g2s_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_g2s_param_5]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rd2; -; CHECK-PTX-SHARED32-NEXT: ld.param.u16 %rs1, [cp_async_bulk_g2s_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [cp_async_bulk_g2s_param_4]; ; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%r2], [%rd1], %r3, [%r1], %rs1; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rs1, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; @@ -63,11 +63,11 @@ define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32 ; CHECK-PTX64-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_s2g_param_0]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_s2g_param_1]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_s2g_param_2]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_1]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_2]; ; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1; -; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_s2g_param_3]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_s2g_param_3]; ; CHECK-PTX64-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd2], [%rd1], %r1, %rd3; ; CHECK-PTX64-NEXT: ret; ; @@ -77,11 +77,11 @@ define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32 ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_s2g_param_0]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [cp_async_bulk_s2g_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_s2g_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_s2g_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [cp_async_bulk_s2g_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_s2g_param_2]; ; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd2, [cp_async_bulk_s2g_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [cp_async_bulk_s2g_param_3]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd1], [%r1], %r2, %rd2; ; CHECK-PTX-SHARED32-NEXT: ret; tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 0, i1 0) @@ -96,10 +96,10 @@ define void @cp_async_bulk_cta_to_cluster(ptr addrspace(3) %src, ptr addrspace(3 ; CHECK-PTX64-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [cp_async_bulk_cta_to_cluster_param_0]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [cp_async_bulk_cta_to_cluster_param_1]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd3, [cp_async_bulk_cta_to_cluster_param_2]; -; CHECK-PTX64-NEXT: ld.param.u32 %r1, [cp_async_bulk_cta_to_cluster_param_3]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [cp_async_bulk_cta_to_cluster_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [cp_async_bulk_cta_to_cluster_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [cp_async_bulk_cta_to_cluster_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [cp_async_bulk_cta_to_cluster_param_3]; ; CHECK-PTX64-NEXT: cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2]; ; CHECK-PTX64-NEXT: ret; ; @@ -108,10 +108,10 @@ define void @cp_async_bulk_cta_to_cluster(ptr addrspace(3) %src, ptr addrspace(3 ; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, [cp_async_bulk_cta_to_cluster_param_0]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r2, [cp_async_bulk_cta_to_cluster_param_1]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r3, [cp_async_bulk_cta_to_cluster_param_2]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r4, 
[cp_async_bulk_cta_to_cluster_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [cp_async_bulk_cta_to_cluster_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [cp_async_bulk_cta_to_cluster_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [cp_async_bulk_cta_to_cluster_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [cp_async_bulk_cta_to_cluster_param_3]; ; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%r3], [%r1], %r4, [%r2]; ; CHECK-PTX-SHARED32-NEXT: ret; tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src, i32 %size) @@ -125,9 +125,9 @@ define void @cp_async_bulk_prefetch(ptr addrspace(1) %src, i32 %size, i64 %ch) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [cp_async_bulk_prefetch_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [cp_async_bulk_prefetch_param_1]; -; CHECK-NEXT: ld.param.u64 %rd2, [cp_async_bulk_prefetch_param_2]; +; CHECK-NEXT: ld.param.b64 %rd1, [cp_async_bulk_prefetch_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [cp_async_bulk_prefetch_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [cp_async_bulk_prefetch_param_2]; ; CHECK-NEXT: cp.async.bulk.prefetch.L2.global.L2::cache_hint [%rd1], %r1, %rd2; ; CHECK-NEXT: cp.async.bulk.prefetch.L2.global [%rd1], %r1; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/ctlz.ll b/llvm/test/CodeGen/NVPTX/ctlz.ll index 1443e5c46346..1c3f69943803 100644 --- a/llvm/test/CodeGen/NVPTX/ctlz.ll +++ b/llvm/test/CodeGen/NVPTX/ctlz.ll @@ -17,7 +17,7 @@ define i32 @myctlz(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [myctlz_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [myctlz_param_0]; ; CHECK-NEXT: clz.b32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -30,7 +30,7 @@ define i32 @myctlz_2(i32 %a) { ; CHECK-NEXT: 
.reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [myctlz_2_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [myctlz_2_param_0]; ; CHECK-NEXT: clz.b32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -47,7 +47,7 @@ define i64 @myctlz64(i64 %a) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [myctlz64_param_0]; ; CHECK-NEXT: clz.b64 %r1, %rd1; ; CHECK-NEXT: cvt.u64.u32 %rd2, %r1; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; @@ -62,7 +62,7 @@ define i64 @myctlz64_2(i64 %a) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [myctlz64_2_param_0]; ; CHECK-NEXT: clz.b64 %r1, %rd1; ; CHECK-NEXT: cvt.u64.u32 %rd2, %r1; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; @@ -81,7 +81,7 @@ define i32 @myctlz64_as_32(i64 %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [myctlz64_as_32_param_0]; ; CHECK-NEXT: clz.b64 %r1, %rd1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -96,7 +96,7 @@ define i32 @myctlz64_as_32_2(i64 %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [myctlz64_as_32_2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [myctlz64_as_32_2_param_0]; ; CHECK-NEXT: clz.b64 %r1, %rd1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -115,7 +115,7 @@ define i16 @myctlz_ret16(i16 %a) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_param_0]; +; CHECK-NEXT: ld.param.b16 %r1, [myctlz_ret16_param_0]; ; CHECK-NEXT: clz.b32 %r2, %r1; ; CHECK-NEXT: add.s32 %r3, %r2, -16; ; CHECK-NEXT: st.param.b32 
[func_retval0], %r3; @@ -129,7 +129,7 @@ define i16 @myctlz_ret16_2(i16 %a) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %r1, [myctlz_ret16_2_param_0]; +; CHECK-NEXT: ld.param.b16 %r1, [myctlz_ret16_2_param_0]; ; CHECK-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-NEXT: clz.b32 %r3, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; @@ -147,11 +147,11 @@ define void @myctlz_store16(i16 %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_param_0]; +; CHECK-NEXT: ld.param.b16 %r1, [myctlz_store16_param_0]; ; CHECK-NEXT: clz.b32 %r2, %r1; ; CHECK-NEXT: add.s32 %r3, %r2, -16; -; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_param_1]; -; CHECK-NEXT: st.u16 [%rd1], %r3; +; CHECK-NEXT: ld.param.b64 %rd1, [myctlz_store16_param_1]; +; CHECK-NEXT: st.b16 [%rd1], %r3; ; CHECK-NEXT: ret; %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone store i16 %val, ptr %b @@ -164,11 +164,11 @@ define void @myctlz_store16_2(i16 %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %r1, [myctlz_store16_2_param_0]; +; CHECK-NEXT: ld.param.b16 %r1, [myctlz_store16_2_param_0]; ; CHECK-NEXT: clz.b32 %r2, %r1; ; CHECK-NEXT: add.s32 %r3, %r2, -16; -; CHECK-NEXT: ld.param.u64 %rd1, [myctlz_store16_2_param_1]; -; CHECK-NEXT: st.u16 [%rd1], %r3; +; CHECK-NEXT: ld.param.b64 %rd1, [myctlz_store16_2_param_1]; +; CHECK-NEXT: st.b16 [%rd1], %r3; ; CHECK-NEXT: ret; %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone store i16 %val, ptr %b diff --git a/llvm/test/CodeGen/NVPTX/dag-cse.ll b/llvm/test/CodeGen/NVPTX/dag-cse.ll index ff22c0bd747e..84a38fd30963 100644 --- a/llvm/test/CodeGen/NVPTX/dag-cse.ll +++ b/llvm/test/CodeGen/NVPTX/dag-cse.ll @@ -9,11 +9,11 @@ ; Verify that loads with different memory types are not subject to CSE ; once they are promoted to the same type. 
; -; CHECK: ld.global.v2.u8 {%[[B1:rs[0-9]+]], %[[B2:rs[0-9]+]]}, [a]; -; CHECK: st.global.v2.u8 [b], {%[[B1]], %[[B2]]}; +; CHECK: ld.global.v2.b8 {%[[B1:rs[0-9]+]], %[[B2:rs[0-9]+]]}, [a]; +; CHECK: st.global.v2.b8 [b], {%[[B1]], %[[B2]]}; ; -; CHECK: ld.global.u32 %[[C:r[0-9]+]], [a]; -; CHECK: st.global.u32 [c], %[[C]]; +; CHECK: ld.global.b32 %[[C:r[0-9]+]], [a]; +; CHECK: st.global.b32 [c], %[[C]]; define void @test1() #0 { %1 = load <2 x i8>, ptr addrspace(1) @a, align 8 diff --git a/llvm/test/CodeGen/NVPTX/demote-vars.ll b/llvm/test/CodeGen/NVPTX/demote-vars.ll index 16ae80dca1ed..ab89b62b53d0 100644 --- a/llvm/test/CodeGen/NVPTX/demote-vars.ll +++ b/llvm/test/CodeGen/NVPTX/demote-vars.ll @@ -66,9 +66,9 @@ define void @define_private_global(i64 %val) { ; ; Also check that the if-then is still here, otherwise we may not be testing ; the "more-than-one-use" part. -; CHECK: st.shared.u64 [private_global_used_more_than_once_in_same_fct], +; CHECK: st.shared.b64 [private_global_used_more_than_once_in_same_fct], ; CHECK: mov.b64 %[[VAR:.*]], 25 -; CHECK: st.shared.u64 [private_global_used_more_than_once_in_same_fct], %[[VAR]] +; CHECK: st.shared.b64 [private_global_used_more_than_once_in_same_fct], %[[VAR]] define void @define_private_global_more_than_one_use(i64 %val, i1 %cond) { store i64 %val, ptr addrspace(3) @private_global_used_more_than_once_in_same_fct br i1 %cond, label %then, label %end diff --git a/llvm/test/CodeGen/NVPTX/discard.ll b/llvm/test/CodeGen/NVPTX/discard.ll index 8e5c9bab97c8..ce72f5f52b8a 100644 --- a/llvm/test/CodeGen/NVPTX/discard.ll +++ b/llvm/test/CodeGen/NVPTX/discard.ll @@ -13,7 +13,7 @@ define void @discard_global_L2(ptr addrspace(1) %global_ptr) { ; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [discard_global_L2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [discard_global_L2_param_0]; ; CHECK-PTX64-NEXT: discard.global.L2 [%rd1], 128; ; 
CHECK-PTX64-NEXT: ret; tail call void @llvm.nvvm.discard.global.L2(ptr addrspace(1) %global_ptr, i64 128) @@ -26,7 +26,7 @@ define void @discard_L2(ptr %ptr) { ; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [discard_L2_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [discard_L2_param_0]; ; CHECK-PTX64-NEXT: discard.L2 [%rd1], 128; ; CHECK-PTX64-NEXT: ret; tail call void @llvm.nvvm.discard.L2(ptr %ptr, i64 128) diff --git a/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll b/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll index b0e2082621bf..a21261c76886 100644 --- a/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll +++ b/llvm/test/CodeGen/NVPTX/disjoint-or-addr.ll @@ -14,7 +14,7 @@ define i32 @test_disjoint_or_addr(i16 %a) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b64 %rd1, a; ; CHECK-NEXT: cvta.global.u64 %rd2, %rd1; -; CHECK-NEXT: ld.u32 %r1, [%rd2+8]; +; CHECK-NEXT: ld.b32 %r1, [%rd2+8]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %a1 = ptrtoint ptr @a to i64 diff --git a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll index a23361656308..cea3ac37c196 100644 --- a/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll +++ b/llvm/test/CodeGen/NVPTX/distributed-shared-cluster.ll @@ -18,17 +18,17 @@ define i32 @test_distributed_shared_cluster_common(ptr %ptr, ptr addrspace(3) %s ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_common_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [test_distributed_shared_cluster_common_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_distributed_shared_cluster_common_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_distributed_shared_cluster_common_param_1]; ; CHECK-NEXT: mov.u32 %r1, %ctaid.x; ; CHECK-NEXT: xor.b32 %r2, %r1, 1; ; CHECK-NEXT: isspacep.shared::cluster %p1, %rd1; ; 
CHECK-NEXT: mapa.u64 %rd3, %rd1, %r2; ; CHECK-NEXT: isspacep.shared::cluster %p2, %rd3; ; CHECK-NEXT: mapa.shared::cluster.u64 %rd4, %rd2, %r2; -; CHECK-NEXT: ld.shared::cluster.u32 %r3, [%rd4]; +; CHECK-NEXT: ld.shared::cluster.b32 %r3, [%rd4]; ; CHECK-NEXT: add.s32 %r4, %r3, 42; -; CHECK-NEXT: st.shared::cluster.u32 [%rd4], %r4; +; CHECK-NEXT: st.shared::cluster.b32 [%rd4], %r4; ; CHECK-NEXT: selp.b32 %r5, 1, 0, %p1; ; CHECK-NEXT: selp.b32 %r6, 1, 0, %p2; ; CHECK-NEXT: add.s32 %r7, %r5, %r6; @@ -64,7 +64,7 @@ define void @test_distributed_shared_cluster_float_atomic(ptr addrspace(7) %dsme ; CHECK-NEXT: .reg .b64 %fd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_float_atomic_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_distributed_shared_cluster_float_atomic_param_0]; ; CHECK-NEXT: mov.b16 %rs1, 0x3C00; ; CHECK-NEXT: atom.shared::cluster.add.noftz.f16 %rs2, [%rd1], %rs1; ; CHECK-NEXT: mov.b16 %rs3, 0x3F80; @@ -90,7 +90,7 @@ define void @test_distributed_shared_cluster_int_atomic(ptr addrspace(7) %dsmem_ ; CHECK-NEXT: .reg .b64 %rd<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_int_atomic_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_distributed_shared_cluster_int_atomic_param_0]; ; CHECK-NEXT: atom.shared::cluster.add.u32 %r1, [%rd1], 1; ; CHECK-NEXT: atom.shared::cluster.add.u64 %rd2, [%rd1], 1; ; CHECK-NEXT: atom.shared::cluster.exch.b32 %r2, [%rd1], 1; @@ -142,7 +142,7 @@ define void @test_distributed_shared_cluster_bitwise_atomic(ptr addrspace(7) %ds ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u64 %rd1, [test_distributed_shared_cluster_bitwise_atomic_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_distributed_shared_cluster_bitwise_atomic_param_0]; ; CHECK-NEXT: atom.shared::cluster.and.b32 %r1, [%rd1], 1; ; CHECK-NEXT: 
atom.shared::cluster.and.b64 %rd2, [%rd1], 1; ; CHECK-NEXT: atom.shared::cluster.or.b32 %r2, [%rd1], 1; @@ -171,7 +171,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: .reg .b64 %rd<12>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_distributed_shared_cluster_cmpxchg_param_0]; ; CHECK-NEXT: atom.relaxed.shared::cluster.cas.b32 %r24, [%rd2], 1, 0; ; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r25, [%rd2], 1, 0; ; CHECK-NEXT: atom.acquire.shared::cluster.cas.b32 %r26, [%rd2], 1, 0; @@ -205,7 +205,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: not.b32 %r2, %r36; ; CHECK-NEXT: mov.b32 %r37, 1; ; CHECK-NEXT: shl.b32 %r3, %r37, %r1; -; CHECK-NEXT: ld.shared::cluster.u32 %r38, [%rd1]; +; CHECK-NEXT: ld.shared::cluster.b32 %r38, [%rd1]; ; CHECK-NEXT: and.b32 %r48, %r38, %r2; ; CHECK-NEXT: $L__BB4_1: // %partword.cmpxchg.loop33 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -220,7 +220,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: mov.b32 %r48, %r7; ; CHECK-NEXT: @%p2 bra $L__BB4_1; ; CHECK-NEXT: $L__BB4_3: // %partword.cmpxchg.end31 -; CHECK-NEXT: ld.shared::cluster.u32 %r40, [%rd1]; +; CHECK-NEXT: ld.shared::cluster.b32 %r40, [%rd1]; ; CHECK-NEXT: and.b32 %r49, %r40, %r2; ; CHECK-NEXT: $L__BB4_4: // %partword.cmpxchg.loop23 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -237,7 +237,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_6: // %partword.cmpxchg.end21 ; CHECK-NEXT: fence.acq_rel.sys; ; CHECK-NEXT: fence.acq_rel.sys; -; CHECK-NEXT: ld.shared::cluster.u32 %r42, [%rd1]; +; CHECK-NEXT: ld.shared::cluster.b32 %r42, [%rd1]; ; CHECK-NEXT: and.b32 %r50, %r42, %r2; ; CHECK-NEXT: $L__BB4_7: // 
%partword.cmpxchg.loop13 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -253,7 +253,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: @%p6 bra $L__BB4_7; ; CHECK-NEXT: $L__BB4_9: // %partword.cmpxchg.end11 ; CHECK-NEXT: fence.acq_rel.sys; -; CHECK-NEXT: ld.shared::cluster.u32 %r44, [%rd1]; +; CHECK-NEXT: ld.shared::cluster.b32 %r44, [%rd1]; ; CHECK-NEXT: and.b32 %r51, %r44, %r2; ; CHECK-NEXT: $L__BB4_10: // %partword.cmpxchg.loop3 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -270,7 +270,7 @@ define void @test_distributed_shared_cluster_cmpxchg(ptr addrspace(7) %dsmem_ptr ; CHECK-NEXT: $L__BB4_12: // %partword.cmpxchg.end1 ; CHECK-NEXT: fence.acq_rel.sys; ; CHECK-NEXT: fence.sc.sys; -; CHECK-NEXT: ld.shared::cluster.u32 %r46, [%rd1]; +; CHECK-NEXT: ld.shared::cluster.b32 %r46, [%rd1]; ; CHECK-NEXT: and.b32 %r52, %r46, %r2; ; CHECK-NEXT: $L__BB4_13: // %partword.cmpxchg.loop ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/NVPTX/div.ll b/llvm/test/CodeGen/NVPTX/div.ll index f8711e3a8359..bd8d9a35eed4 100644 --- a/llvm/test/CodeGen/NVPTX/div.ll +++ b/llvm/test/CodeGen/NVPTX/div.ll @@ -8,15 +8,15 @@ define float @div_full(float %a, float %b) { ; CHECK-NEXT: .reg .b32 %f<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [div_full_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [div_full_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [div_full_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [div_full_param_1]; ; CHECK-NEXT: div.full.f32 %f3, %f1, %f2; ; CHECK-NEXT: mov.b32 %f4, 0f40400000; ; CHECK-NEXT: div.full.f32 %f5, %f3, %f4; ; CHECK-NEXT: div.full.ftz.f32 %f6, %f5, %f2; ; CHECK-NEXT: mov.b32 %f7, 0f40800000; ; CHECK-NEXT: div.full.ftz.f32 %f8, %f6, %f7; -; CHECK-NEXT: st.param.f32 [func_retval0], %f8; +; CHECK-NEXT: st.param.b32 [func_retval0], %f8; ; CHECK-NEXT: ret; %1 = call float @llvm.nvvm.div.full(float %a, float %b) %2 = call float 
@llvm.nvvm.div.full(float %1, float 3.0) diff --git a/llvm/test/CodeGen/NVPTX/dot-product.ll b/llvm/test/CodeGen/NVPTX/dot-product.ll index 984b2bb0d27d..6d634229b37a 100644 --- a/llvm/test/CodeGen/NVPTX/dot-product.ll +++ b/llvm/test/CodeGen/NVPTX/dot-product.ll @@ -15,9 +15,9 @@ define i32 @test_dp4a_u32_u32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_u32_u32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_u32_u32_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_u32_u32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_dp4a_u32_u32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_dp4a_u32_u32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_dp4a_u32_u32_param_2]; ; CHECK-NEXT: dp4a.u32.u32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -31,7 +31,7 @@ define i32 @test_dp4a_u32imm_u32imm(i32 %c) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_u32imm_u32imm_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_dp4a_u32imm_u32imm_param_0]; ; CHECK-NEXT: mov.b32 %r2, 0; ; CHECK-NEXT: dp4a.u32.u32 %r3, %r2, %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; @@ -46,9 +46,9 @@ define i32 @test_dp4a_u32_s32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_u32_s32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_u32_s32_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_u32_s32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_dp4a_u32_s32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_dp4a_u32_s32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_dp4a_u32_s32_param_2]; ; CHECK-NEXT: dp4a.u32.s32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -62,9 +62,9 @@ define i32 @test_dp4a_s32_u32(i32 %a, i32 %b, i32 %c) { ; 
CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_s32_u32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_s32_u32_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_s32_u32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_dp4a_s32_u32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_dp4a_s32_u32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_dp4a_s32_u32_param_2]; ; CHECK-NEXT: dp4a.s32.u32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -78,9 +78,9 @@ define i32 @test_dp4a_s32_s32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_dp4a_s32_s32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_dp4a_s32_s32_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_dp4a_s32_s32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_dp4a_s32_s32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_dp4a_s32_s32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_dp4a_s32_s32_param_2]; ; CHECK-NEXT: dp4a.s32.s32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -99,9 +99,9 @@ define i32 @test_dp2a_lo_u32_u32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_u32_u32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_u32_u32_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_u32_u32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_lo_u32_u32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_lo_u32_u32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_lo_u32_u32_param_2]; ; CHECK-NEXT: dp2a.lo.u32.u32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -115,9 +115,9 @@ define i32 @test_dp2a_lo_u32_s32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; 
CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_u32_s32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_u32_s32_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_u32_s32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_lo_u32_s32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_lo_u32_s32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_lo_u32_s32_param_2]; ; CHECK-NEXT: dp2a.lo.u32.s32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -131,9 +131,9 @@ define i32 @test_dp2a_lo_s32_u32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_s32_u32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_s32_u32_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_s32_u32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_lo_s32_u32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_lo_s32_u32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_lo_s32_u32_param_2]; ; CHECK-NEXT: dp2a.lo.s32.u32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -147,9 +147,9 @@ define i32 @test_dp2a_lo_s32_s32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_lo_s32_s32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_lo_s32_s32_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_lo_s32_s32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_lo_s32_s32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_lo_s32_s32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_lo_s32_s32_param_2]; ; CHECK-NEXT: dp2a.lo.s32.s32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -163,9 +163,9 @@ define i32 @test_dp2a_hi_u32_u32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, 
[test_dp2a_hi_u32_u32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_u32_u32_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_u32_u32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_hi_u32_u32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_hi_u32_u32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_hi_u32_u32_param_2]; ; CHECK-NEXT: dp2a.hi.u32.u32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -179,9 +179,9 @@ define i32 @test_dp2a_hi_u32_s32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_hi_u32_s32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_u32_s32_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_u32_s32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_hi_u32_s32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_hi_u32_s32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_hi_u32_s32_param_2]; ; CHECK-NEXT: dp2a.hi.u32.s32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -195,9 +195,9 @@ define i32 @test_dp2a_hi_s32_u32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_hi_s32_u32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_s32_u32_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_s32_u32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_hi_s32_u32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_hi_s32_u32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_hi_s32_u32_param_2]; ; CHECK-NEXT: dp2a.hi.s32.u32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -211,9 +211,9 @@ define i32 @test_dp2a_hi_s32_s32(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_dp2a_hi_s32_s32_param_0]; 
-; CHECK-NEXT: ld.param.u32 %r2, [test_dp2a_hi_s32_s32_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_dp2a_hi_s32_s32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_dp2a_hi_s32_s32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_dp2a_hi_s32_s32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_dp2a_hi_s32_s32_param_2]; ; CHECK-NEXT: dp2a.hi.s32.s32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll b/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll index 5d46c74157ab..f70831cc97ae 100644 --- a/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll +++ b/llvm/test/CodeGen/NVPTX/dynamic-stackalloc-regression.ll @@ -9,17 +9,17 @@ define void @foo(i64 %a, ptr %p0, ptr %p1) { ; CHECK-NEXT: .reg .b64 %rd<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [foo_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0]; ; CHECK-NEXT: add.s64 %rd2, %rd1, 7; ; CHECK-NEXT: and.b64 %rd3, %rd2, -8; ; CHECK-NEXT: alloca.u64 %rd4, %rd3, 16; ; CHECK-NEXT: cvta.local.u64 %rd4, %rd4; -; CHECK-NEXT: ld.param.u64 %rd5, [foo_param_1]; +; CHECK-NEXT: ld.param.b64 %rd5, [foo_param_1]; ; CHECK-NEXT: alloca.u64 %rd6, %rd3, 16; ; CHECK-NEXT: cvta.local.u64 %rd6, %rd6; -; CHECK-NEXT: ld.param.u64 %rd7, [foo_param_2]; -; CHECK-NEXT: st.u64 [%rd5], %rd4; -; CHECK-NEXT: st.u64 [%rd7], %rd6; +; CHECK-NEXT: ld.param.b64 %rd7, [foo_param_2]; +; CHECK-NEXT: st.b64 [%rd5], %rd4; +; CHECK-NEXT: st.b64 [%rd7], %rd6; ; CHECK-NEXT: ret; %b = alloca i8, i64 %a, align 16 %c = alloca i8, i64 %a, align 16 diff --git a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll index de19d2983f34..664569e3c525 100644 --- a/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll +++ b/llvm/test/CodeGen/NVPTX/dynamic_stackalloc.ll @@ -11,7 +11,7 @@ ; CHECK-LABEL: .visible .func (.param .b32 func_retval0) test_dynamic_stackalloc( ; CHECK-NOT: 
__local_depot -; CHECK-32: ld.param.u32 %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0]; +; CHECK-32: ld.param.b32 %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0]; ; CHECK-32-NEXT: add.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 7; ; CHECK-32-NEXT: and.b32 %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8; ; CHECK-32-NEXT: alloca.u32 %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16; @@ -20,7 +20,7 @@ ; CHECK-32-NEXT: .param .b32 param0; ; CHECK-32-NEXT: st.param.b32 [param0], %r[[ALLOCA]]; -; CHECK-64: ld.param.u64 %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0]; +; CHECK-64: ld.param.b64 %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0]; ; CHECK-64-NEXT: add.s64 %rd[[SIZE2:[0-9]]], %rd[[SIZE]], 7; ; CHECK-64-NEXT: and.b64 %rd[[SIZE3:[0-9]]], %rd[[SIZE2]], -8; ; CHECK-64-NEXT: alloca.u64 %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16; diff --git a/llvm/test/CodeGen/NVPTX/elect.ll b/llvm/test/CodeGen/NVPTX/elect.ll index 34b40ccdcbcb..93c30a9b0006 100644 --- a/llvm/test/CodeGen/NVPTX/elect.ll +++ b/llvm/test/CodeGen/NVPTX/elect.ll @@ -14,7 +14,7 @@ define {i32, i1} @elect_sync(i32 %mask) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [elect_sync_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [elect_sync_param_0]; ; CHECK-NEXT: elect.sync %r2|%p1, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: selp.b16 %rs1, -1, 0, %p1; @@ -51,7 +51,7 @@ define {i32, i1} @elect_sync_twice(i32 %mask) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [elect_sync_twice_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [elect_sync_twice_param_0]; ; CHECK-NEXT: elect.sync %r2|%p1, %r1; ; CHECK-NEXT: elect.sync %r3|%p2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; diff --git a/llvm/test/CodeGen/NVPTX/extloadv.ll b/llvm/test/CodeGen/NVPTX/extloadv.ll index c9d14efa3a00..3d861e69128b 100644 --- a/llvm/test/CodeGen/NVPTX/extloadv.ll +++ b/llvm/test/CodeGen/NVPTX/extloadv.ll @@ -4,7 +4,7 @@ 
define void @foo(ptr nocapture readonly %x_value, ptr nocapture %output) #0 { %1 = load <4 x float>, ptr %x_value, align 16 %2 = fpext <4 x float> %1 to <4 x double> -; CHECK-NOT: ld.v2.f32 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]; +; CHECK-NOT: ld.v2.b32 {%fd{{[0-9]+}}, %fd{{[0-9]+}}}, [%rd{{[0-9]+}}]; ; CHECK: cvt.f64.f32 ; CHECK: cvt.f64.f32 ; CHECK: cvt.f64.f32 diff --git a/llvm/test/CodeGen/NVPTX/extractelement.ll b/llvm/test/CodeGen/NVPTX/extractelement.ll index bf4a41ae4605..79d80e6f8fa8 100644 --- a/llvm/test/CodeGen/NVPTX/extractelement.ll +++ b/llvm/test/CodeGen/NVPTX/extractelement.ll @@ -11,7 +11,7 @@ define i16 @test_v2i8(i16 %a) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [test_v2i8_param_0]; +; CHECK-NEXT: ld.param.b16 %rs1, [test_v2i8_param_0]; ; CHECK-NEXT: cvt.s16.s8 %rs2, %rs1; ; CHECK-NEXT: shr.s16 %rs3, %rs1, 8; ; CHECK-NEXT: add.s16 %rs4, %rs2, %rs3; @@ -36,8 +36,8 @@ define i1 @test_v2i8_load(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_v2i8_load_param_0]; -; CHECK-NEXT: ld.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_v2i8_load_param_0]; +; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd1]; ; CHECK-NEXT: or.b16 %rs5, %rs1, %rs2; ; CHECK-NEXT: and.b16 %rs6, %rs5, 255; ; CHECK-NEXT: setp.eq.s16 %p1, %rs6, 0; @@ -59,7 +59,7 @@ define i16 @test_v4i8(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_v4i8_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_param_0]; ; CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8; ; CHECK-NEXT: cvt.s8.s32 %rs1, %r2; ; CHECK-NEXT: bfe.s32 %r3, %r1, 8, 8; @@ -95,7 +95,7 @@ define i32 @test_v4i8_s32(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_v4i8_s32_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_s32_param_0]; ; 
CHECK-NEXT: bfe.s32 %r2, %r1, 0, 8; ; CHECK-NEXT: bfe.s32 %r3, %r1, 8, 8; ; CHECK-NEXT: bfe.s32 %r4, %r1, 16, 8; @@ -126,7 +126,7 @@ define i32 @test_v4i8_u32(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_v4i8_u32_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_v4i8_u32_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; ; CHECK-NEXT: bfe.u32 %r3, %r1, 8, 8; ; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; @@ -161,7 +161,7 @@ define i16 @test_v8i8(i64 %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_v8i8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_v8i8_param_0]; ; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd1; } ; CHECK-NEXT: cvt.u32.u64 %r2, %rd1; ; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll index bea9db03caf6..23fab2205786 100644 --- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll @@ -220,7 +220,7 @@ define half @test_frem(half %a, half %b) #0 { ; CHECK-LABEL: test_store( ; CHECK-DAG: ld.param.b16 [[A:%rs[0-9]+]], [test_store_param_0]; -; CHECK-DAG: ld.param.u64 %[[PTR:rd[0-9]+]], [test_store_param_1]; +; CHECK-DAG: ld.param.b64 %[[PTR:rd[0-9]+]], [test_store_param_1]; ; CHECK-NEXT: st.b16 [%[[PTR]]], [[A]]; ; CHECK-NEXT: ret; define void @test_store(half %a, ptr %b) #0 { @@ -229,7 +229,7 @@ define void @test_store(half %a, ptr %b) #0 { } ; CHECK-LABEL: test_load( -; CHECK: ld.param.u64 %[[PTR:rd[0-9]+]], [test_load_param_0]; +; CHECK: ld.param.b64 %[[PTR:rd[0-9]+]], [test_load_param_0]; ; CHECK-NEXT: ld.b16 [[R:%rs[0-9]+]], [%[[PTR]]]; ; CHECK-NEXT: st.param.b16 [func_retval0], [[R]]; ; CHECK-NEXT: ret; @@ -239,12 +239,12 @@ define half @test_load(ptr %a) #0 { } ; CHECK-LABEL: .visible .func test_halfp0a1( -; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0]; -; 
CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1]; -; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] -; CHECK-DAG: st.u8 [%[[TO]]], [[B0]] -; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] -; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]] +; CHECK-DAG: ld.param.b64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0]; +; CHECK-DAG: ld.param.b64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1]; +; CHECK-DAG: ld.b8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] +; CHECK-DAG: st.b8 [%[[TO]]], [[B0]] +; CHECK-DAG: ld.b8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] +; CHECK-DAG: st.b8 [%[[TO]]+1], [[B1]] ; CHECK: ret define void @test_halfp0a1(ptr noalias readonly %from, ptr %to) { %1 = load half, ptr %from , align 1 @@ -357,8 +357,8 @@ define half @test_select_cc(half %a, half %b, half %c, half %d) #0 { } ; CHECK-LABEL: test_select_cc_f32_f16( -; CHECK-DAG: ld.param.f32 [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0]; -; CHECK-DAG: ld.param.f32 [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1]; +; CHECK-DAG: ld.param.b32 [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1]; ; CHECK-DAG: ld.param.b16 [[C:%rs[0-9]+]], [test_select_cc_f32_f16_param_2]; ; CHECK-DAG: ld.param.b16 [[D:%rs[0-9]+]], [test_select_cc_f32_f16_param_3]; ; CHECK-F16-NOFTZ: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]] @@ -367,7 +367,7 @@ define half @test_select_cc(half %a, half %b, half %c, half %d) #0 { ; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]]; ; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]] ; CHECK-NEXT: selp.f32 [[R:%f[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.f32 [func_retval0], [[R]]; +; CHECK-NEXT: st.param.b32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 { %cc = fcmp une half %c, %d @@ -377,8 +377,8 @@ define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 { ; CHECK-LABEL: test_select_cc_f16_f32( ; 
CHECK-DAG: ld.param.b16 [[A:%rs[0-9]+]], [test_select_cc_f16_f32_param_0]; -; CHECK-DAG: ld.param.f32 [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2]; -; CHECK-DAG: ld.param.f32 [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3]; +; CHECK-DAG: ld.param.b32 [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2]; +; CHECK-DAG: ld.param.b32 [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3]; ; CHECK-NOFTZ-DAG: setp.neu.f32 [[PRED:%p[0-9]+]], [[C]], [[D]] ; CHECK-F16-FTZ-DAG: setp.neu.ftz.f32 [[PRED:%p[0-9]+]], [[C]], [[D]] ; CHECK-DAG: ld.param.b16 [[B:%rs[0-9]+]], [test_select_cc_f16_f32_param_1]; @@ -619,17 +619,17 @@ define i1 @test_fcmp_ord(half %a, half %b) #0 { ; CHECK-LABEL: test_br_cc( ; CHECK-DAG: ld.param.b16 [[A:%rs[0-9]+]], [test_br_cc_param_0]; ; CHECK-DAG: ld.param.b16 [[B:%rs[0-9]+]], [test_br_cc_param_1]; -; CHECK-DAG: ld.param.u64 %[[C:rd[0-9]+]], [test_br_cc_param_2]; -; CHECK-DAG: ld.param.u64 %[[D:rd[0-9]+]], [test_br_cc_param_3]; +; CHECK-DAG: ld.param.b64 %[[C:rd[0-9]+]], [test_br_cc_param_2]; +; CHECK-DAG: ld.param.b64 %[[D:rd[0-9]+]], [test_br_cc_param_3]; ; CHECK-F16-NOFTZ: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] ; CHECK-F16-FTZ: setp.lt.ftz.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; ; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] ; CHECK-NEXT: @[[PRED]] bra [[LABEL:\$L__BB.*]]; -; CHECK: st.u32 [%[[C]]], +; CHECK: st.b32 [%[[C]]], ; CHECK: [[LABEL]]: -; CHECK: st.u32 [%[[D]]], +; CHECK: st.b32 [%[[D]]], ; CHECK: ret; define void @test_br_cc(half %a, half %b, ptr %p1, ptr %p2) #0 { %c = fcmp uge half %a, %b @@ -643,7 +643,7 @@ else: } ; CHECK-LABEL: test_phi( -; CHECK: ld.param.u64 %[[P1:rd[0-9]+]], [test_phi_param_0]; +; CHECK: ld.param.b64 %[[P1:rd[0-9]+]], [test_phi_param_0]; ; CHECK: ld.b16 {{%rs[0-9]+}}, [%[[P1]]]; ; CHECK: [[LOOP:\$L__BB[0-9_]+]]: ; CHECK: mov.b16 [[R:%rs[0-9]+]], [[AB:%rs[0-9]+]]; @@ -712,7 +712,7 @@ define i64 
@test_fptoui_i64(half %a) #0 { } ; CHECK-LABEL: test_uitofp_i32( -; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_param_0]; +; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_uitofp_i32_param_0]; ; CHECK: cvt.rn.f16.u32 [[R:%rs[0-9]+]], [[A]]; ; CHECK: st.param.b16 [func_retval0], [[R]]; ; CHECK: ret; @@ -722,7 +722,7 @@ define half @test_uitofp_i32(i32 %a) #0 { } ; CHECK-LABEL: test_uitofp_i64( -; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_uitofp_i64_param_0]; +; CHECK: ld.param.b64 [[A:%rd[0-9]+]], [test_uitofp_i64_param_0]; ; CHECK: cvt.rn.f16.u64 [[R:%rs[0-9]+]], [[A]]; ; CHECK: st.param.b16 [func_retval0], [[R]]; ; CHECK: ret; @@ -732,7 +732,7 @@ define half @test_uitofp_i64(i64 %a) #0 { } ; CHECK-LABEL: test_sitofp_i32( -; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_param_0]; +; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_sitofp_i32_param_0]; ; CHECK: cvt.rn.f16.s32 [[R:%rs[0-9]+]], [[A]]; ; CHECK: st.param.b16 [func_retval0], [[R]]; ; CHECK: ret; @@ -742,7 +742,7 @@ define half @test_sitofp_i32(i32 %a) #0 { } ; CHECK-LABEL: test_sitofp_i64( -; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_sitofp_i64_param_0]; +; CHECK: ld.param.b64 [[A:%rd[0-9]+]], [test_sitofp_i64_param_0]; ; CHECK: cvt.rn.f16.s64 [[R:%rs[0-9]+]], [[A]]; ; CHECK: st.param.b16 [func_retval0], [[R]]; ; CHECK: ret; @@ -752,7 +752,7 @@ define half @test_sitofp_i64(i64 %a) #0 { } ; CHECK-LABEL: test_uitofp_i32_fadd( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0]; +; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0]; ; CHECK-DAG: cvt.rn.f16.u32 [[C:%rs[0-9]+]], [[A]]; ; CHECK-DAG: ld.param.b16 [[B:%rs[0-9]+]], [test_uitofp_i32_fadd_param_1]; ; CHECK-F16-NOFTZ: add.rn.f16 [[R:%rs[0-9]+]], [[B]], [[C]]; @@ -770,7 +770,7 @@ define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 { } ; CHECK-LABEL: test_sitofp_i32_fadd( -; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_fadd_param_0]; +; CHECK-DAG: ld.param.b32 [[A:%r[0-9]+]], 
[test_sitofp_i32_fadd_param_0]; ; CHECK-DAG: cvt.rn.f16.s32 [[C:%rs[0-9]+]], [[A]]; ; CHECK-DAG: ld.param.b16 [[B:%rs[0-9]+]], [test_sitofp_i32_fadd_param_1]; ; CHECK-F16-NOFTZ: add.rn.f16 [[R:%rs[0-9]+]], [[B]], [[C]]; @@ -788,7 +788,7 @@ define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 { } ; CHECK-LABEL: test_fptrunc_float( -; CHECK: ld.param.f32 [[A:%f[0-9]+]], [test_fptrunc_float_param_0]; +; CHECK: ld.param.b32 [[A:%f[0-9]+]], [test_fptrunc_float_param_0]; ; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[A]]; ; CHECK: st.param.b16 [func_retval0], [[R]]; ; CHECK: ret; @@ -798,7 +798,7 @@ define half @test_fptrunc_float(float %a) #0 { } ; CHECK-LABEL: test_fptrunc_double( -; CHECK: ld.param.f64 [[A:%fd[0-9]+]], [test_fptrunc_double_param_0]; +; CHECK: ld.param.b64 [[A:%fd[0-9]+]], [test_fptrunc_double_param_0]; ; CHECK: cvt.rn.f16.f64 [[R:%rs[0-9]+]], [[A]]; ; CHECK: st.param.b16 [func_retval0], [[R]]; ; CHECK: ret; @@ -811,7 +811,7 @@ define half @test_fptrunc_double(double %a) #0 { ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fpext_float_param_0]; ; CHECK-NOFTZ: cvt.f32.f16 [[R:%f[0-9]+]], [[A]]; ; CHECK-F16-FTZ: cvt.ftz.f32.f16 [[R:%f[0-9]+]], [[A]]; -; CHECK: st.param.f32 [func_retval0], [[R]]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK: ret; define float @test_fpext_float(half %a) #0 { %r = fpext half %a to float @@ -821,7 +821,7 @@ define float @test_fpext_float(half %a) #0 { ; CHECK-LABEL: test_fpext_double( ; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fpext_double_param_0]; ; CHECK: cvt.f64.f16 [[R:%fd[0-9]+]], [[A]]; -; CHECK: st.param.f64 [func_retval0], [[R]]; +; CHECK: st.param.b64 [func_retval0], [[R]]; ; CHECK: ret; define double @test_fpext_double(half %a) #0 { %r = fpext half %a to double @@ -840,7 +840,7 @@ define i16 @test_bitcast_halftoi16(half %a) #0 { } ; CHECK-LABEL: test_bitcast_i16tohalf( -; CHECK: ld.param.u16 [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0]; +; CHECK: ld.param.b16 [[AS:%rs[0-9]+]], 
[test_bitcast_i16tohalf_param_0]; ; CHECK: st.param.b16 [func_retval0], [[AS]]; ; CHECK: ret; define half @test_bitcast_i16tohalf(i16 %a) #0 { @@ -1043,7 +1043,7 @@ define half @test_copysign(half %a, half %b) #0 { ; CHECK-LABEL: test_copysign_f32( ; CHECK-DAG: ld.param.b16 [[AH:%rs[0-9]+]], [test_copysign_f32_param_0]; -; CHECK-DAG: ld.param.f32 [[BF:%f[0-9]+]], [test_copysign_f32_param_1]; +; CHECK-DAG: ld.param.b32 [[BF:%f[0-9]+]], [test_copysign_f32_param_1]; ; CHECK-DAG: mov.b32 [[B:%r[0-9]+]], [[BF]]; ; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AH]], 32767; ; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648; @@ -1059,7 +1059,7 @@ define half @test_copysign_f32(half %a, float %b) #0 { ; CHECK-LABEL: test_copysign_f64( ; CHECK-DAG: ld.param.b16 [[AH:%rs[0-9]+]], [test_copysign_f64_param_0]; -; CHECK-DAG: ld.param.f64 [[BD:%fd[0-9]+]], [test_copysign_f64_param_1]; +; CHECK-DAG: ld.param.b64 [[BD:%fd[0-9]+]], [test_copysign_f64_param_1]; ; CHECK-DAG: mov.b64 [[B:%rd[0-9]+]], [[BD]]; ; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AH]], 32767; ; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[B]], -9223372036854775808; @@ -1082,7 +1082,7 @@ define half @test_copysign_f64(half %a, double %b) #0 { ; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]]; ; CHECK-NOFTZ: cvt.f32.f16 [[XR:%f[0-9]+]], [[RX]]; ; CHECK-F16-FTZ: cvt.ftz.f32.f16 [[XR:%f[0-9]+]], [[RX]]; -; CHECK: st.param.f32 [func_retval0], [[XR]]; +; CHECK: st.param.b32 [func_retval0], [[XR]]; ; CHECK: ret; define float @test_copysign_extended(half %a, half %b) #0 { %r = call half @llvm.copysign.f16(half %a, half %b) diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll index e854e5a6e5aa..7fef947a0e59 100644 --- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll @@ -82,7 +82,7 @@ define half @test_extract_i(<2 x half> %a, i64 %idx) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: 
ld.param.u64 %rd1, [test_extract_i_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; ; CHECK-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; ; CHECK-NEXT: setp.eq.s64 %p1, %rd1, 0; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; @@ -390,8 +390,8 @@ define void @test_ldst_v2f16(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2f16_param_1]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2f16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v2f16_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v2f16_param_0]; ; CHECK-NEXT: ld.b32 %r1, [%rd1]; ; CHECK-NEXT: st.b32 [%rd2], %r1; ; CHECK-NEXT: ret; @@ -412,11 +412,11 @@ define void @test_ldst_v3f16(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v3f16_param_1]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3f16_param_0]; -; CHECK-NEXT: ld.u64 %rd3, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v3f16_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v3f16_param_0]; +; CHECK-NEXT: ld.b64 %rd3, [%rd1]; ; CHECK-NEXT: { .reg .b32 tmp; mov.b64 {tmp, %r1}, %rd3; } -; CHECK-NEXT: st.u32 [%rd2], %rd3; +; CHECK-NEXT: st.b32 [%rd2], %rd3; ; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; } ; CHECK-NEXT: st.b16 [%rd2+4], %rs1; ; CHECK-NEXT: ret; @@ -432,8 +432,8 @@ define void @test_ldst_v4f16(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4f16_param_1]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4f16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4f16_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4f16_param_0]; ; CHECK-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; ; CHECK-NEXT: st.v4.b16 [%rd2], {%rs1, %rs2, %rs3, %rs4}; ; CHECK-NEXT: ret; @@ -449,8 +449,8 @@ define void @test_ldst_v8f16(ptr %a, ptr %b) { 
; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8f16_param_1]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8f16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v8f16_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v8f16_param_0]; ; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -553,7 +553,7 @@ define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [test_select_param_2]; +; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2]; ; CHECK-NEXT: and.b16 %rs2, %rs1, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; ; CHECK-NEXT: ld.param.b32 %r2, [test_select_param_1]; @@ -626,14 +626,14 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, ; CHECK-F16-NEXT: .reg .b32 %f<7>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f16_param_1]; -; CHECK-F16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f16_param_0]; +; CHECK-F16-NEXT: ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f32_f16_param_1]; +; CHECK-F16-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_f16_param_0]; ; CHECK-F16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3]; ; CHECK-F16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2]; ; CHECK-F16-NEXT: setp.neu.f16x2 %p1|%p2, %r1, %r2; ; CHECK-F16-NEXT: selp.f32 %f5, %f2, %f4, %p2; ; CHECK-F16-NEXT: selp.f32 %f6, %f1, %f3, %p1; -; CHECK-F16-NEXT: st.param.v2.f32 [func_retval0], {%f6, %f5}; +; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%f6, %f5}; ; CHECK-F16-NEXT: ret; ; ; CHECK-NOF16-LABEL: test_select_cc_f32_f16( @@ -644,8 +644,8 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, ; CHECK-NOF16-NEXT: .reg .b32 %f<11>; ; CHECK-NOF16-EMPTY: ; 
CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f32_f16_param_1]; -; CHECK-NOF16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f32_f16_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f32_f16_param_1]; +; CHECK-NOF16-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f32_f16_param_0]; ; CHECK-NOF16-NEXT: ld.param.b32 %r2, [test_select_cc_f32_f16_param_3]; ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_select_cc_f32_f16_param_2]; ; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r2; @@ -658,7 +658,7 @@ define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, ; CHECK-NOF16-NEXT: setp.neu.f32 %p2, %f8, %f7; ; CHECK-NOF16-NEXT: selp.f32 %f9, %f2, %f4, %p2; ; CHECK-NOF16-NEXT: selp.f32 %f10, %f1, %f3, %p1; -; CHECK-NOF16-NEXT: st.param.v2.f32 [func_retval0], {%f10, %f9}; +; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%f10, %f9}; ; CHECK-NOF16-NEXT: ret; <2 x half> %c, <2 x half> %d) #0 { %cc = fcmp une <2 x half> %c, %d @@ -675,8 +675,8 @@ define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b, ; CHECK-NEXT: .reg .b32 %f<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.f32 {%f3, %f4}, [test_select_cc_f16_f32_param_3]; -; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_select_cc_f16_f32_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%f3, %f4}, [test_select_cc_f16_f32_param_3]; +; CHECK-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_select_cc_f16_f32_param_2]; ; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_f16_f32_param_1]; ; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_f16_f32_param_0]; ; CHECK-NEXT: setp.neu.f32 %p1, %f1, %f3; @@ -1388,7 +1388,7 @@ define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_param_0]; ; CHECK-NEXT: cvt.rn.f16.u32 %rs1, %r2; ; 
CHECK-NEXT: cvt.rn.f16.u32 %rs2, %r1; ; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1}; @@ -1406,7 +1406,7 @@ define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_uitofp_2xi64_param_0]; ; CHECK-NEXT: cvt.rn.f16.u64 %rs1, %rd2; ; CHECK-NEXT: cvt.rn.f16.u64 %rs2, %rd1; ; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1}; @@ -1423,7 +1423,7 @@ define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_param_0]; ; CHECK-NEXT: cvt.rn.f16.s32 %rs1, %r2; ; CHECK-NEXT: cvt.rn.f16.s32 %rs2, %r1; ; CHECK-NEXT: mov.b32 %r3, {%rs2, %rs1}; @@ -1441,7 +1441,7 @@ define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_sitofp_2xi64_param_0]; ; CHECK-NEXT: cvt.rn.f16.s64 %rs1, %rd2; ; CHECK-NEXT: cvt.rn.f16.s64 %rs2, %rd1; ; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1}; @@ -1459,7 +1459,7 @@ define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { ; CHECK-F16-NEXT: .reg .b32 %r<6>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-F16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; ; CHECK-F16-NEXT: ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1]; ; CHECK-F16-NEXT: cvt.rn.f16.u32 %rs1, %r2; ; CHECK-F16-NEXT: cvt.rn.f16.u32 %rs2, %r1; @@ -1475,7 +1475,7 @@ define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %f<7>; ; CHECK-NOF16-EMPTY: ; 
CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_uitofp_2xi32_fadd_param_0]; ; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_uitofp_2xi32_fadd_param_1]; ; CHECK-NOF16-NEXT: cvt.rn.f16.u32 %rs1, %r1; ; CHECK-NOF16-NEXT: cvt.rn.f16.u32 %rs2, %r2; @@ -1503,7 +1503,7 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { ; CHECK-F16-NEXT: .reg .b32 %r<6>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0]; +; CHECK-F16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0]; ; CHECK-F16-NEXT: ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1]; ; CHECK-F16-NEXT: cvt.rn.f16.s32 %rs1, %r2; ; CHECK-F16-NEXT: cvt.rn.f16.s32 %rs2, %r1; @@ -1519,7 +1519,7 @@ define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %f<7>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0]; +; CHECK-NOF16-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_sitofp_2xi32_fadd_param_0]; ; CHECK-NOF16-NEXT: ld.param.b32 %r3, [test_sitofp_2xi32_fadd_param_1]; ; CHECK-NOF16-NEXT: cvt.rn.f16.s32 %rs1, %r1; ; CHECK-NOF16-NEXT: cvt.rn.f16.s32 %rs2, %r2; @@ -1548,7 +1548,7 @@ define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_fptrunc_2xfloat_param_0]; ; CHECK-NEXT: cvt.rn.f16.f32 %rs1, %f2; ; CHECK-NEXT: cvt.rn.f16.f32 %rs2, %f1; ; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1}; @@ -1566,7 +1566,7 @@ define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.f64 {%fd1, %fd2}, 
[test_fptrunc_2xdouble_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%fd1, %fd2}, [test_fptrunc_2xdouble_param_0]; ; CHECK-NEXT: cvt.rn.f16.f64 %rs1, %fd2; ; CHECK-NEXT: cvt.rn.f16.f64 %rs2, %fd1; ; CHECK-NEXT: mov.b32 %r1, {%rs2, %rs1}; @@ -1588,7 +1588,7 @@ define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 { ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; ; CHECK-NEXT: cvt.f32.f16 %f2, %rs1; -; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1}; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%f2, %f1}; ; CHECK-NEXT: ret; %r = fpext <2 x half> %a to <2 x float> ret <2 x float> %r @@ -1606,7 +1606,7 @@ define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f64.f16 %fd1, %rs2; ; CHECK-NEXT: cvt.f64.f16 %fd2, %rs1; -; CHECK-NEXT: st.param.v2.f64 [func_retval0], {%fd2, %fd1}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%fd2, %fd1}; ; CHECK-NEXT: ret; %r = fpext <2 x half> %a to <2 x double> ret <2 x double> %r @@ -1619,7 +1619,7 @@ define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_2xhalf_to_2xi16_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_2xhalf_to_2xi16_param_0]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %r = bitcast <2 x half> %a to <2 x i16> @@ -1632,7 +1632,7 @@ define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_2xi16_to_2xhalf_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_2xi16_to_2xhalf_param_0]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %r = bitcast <2 x i16> %a to <2 x half> @@ -1646,7 +1646,7 @@ define <2 x half> @test_bitcast_float_to_2xhalf(float %a) #0 { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // 
%bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [test_bitcast_float_to_2xhalf_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [test_bitcast_float_to_2xhalf_param_0]; ; CHECK-NEXT: mov.b32 %r1, %f1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -1661,9 +1661,9 @@ define float @test_bitcast_2xhalf_to_float(<2 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_2xhalf_to_float_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_2xhalf_to_float_param_0]; ; CHECK-NEXT: mov.b32 %f1, %r1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %f1; ; CHECK-NEXT: ret; %r = bitcast <2 x half> %a to float ret float %r @@ -1987,7 +1987,7 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-F16-NEXT: .reg .b32 %f<3>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_copysign_f32_param_1]; +; CHECK-F16-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_copysign_f32_param_1]; ; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs1, %f2; ; CHECK-F16-NEXT: cvt.rn.f16.f32 %rs2, %f1; @@ -2005,7 +2005,7 @@ define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b32 %f<3>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.f32 {%f1, %f2}, [test_copysign_f32_param_1]; +; CHECK-NOF16-NEXT: ld.param.v2.b32 {%f1, %f2}, [test_copysign_f32_param_1]; ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f32_param_0]; ; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767; @@ -2034,7 +2034,7 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { ; CHECK-F16-NEXT: .reg .b64 %fd<3>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_copysign_f64_param_1]; +; 
CHECK-F16-NEXT: ld.param.v2.b64 {%fd1, %fd2}, [test_copysign_f64_param_1]; ; CHECK-F16-NEXT: ld.param.b32 %r1, [test_copysign_f64_param_0]; ; CHECK-F16-NEXT: cvt.rn.f16.f64 %rs1, %fd2; ; CHECK-F16-NEXT: cvt.rn.f16.f64 %rs2, %fd1; @@ -2053,7 +2053,7 @@ define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { ; CHECK-NOF16-NEXT: .reg .b64 %fd<3>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.v2.f64 {%fd1, %fd2}, [test_copysign_f64_param_1]; +; CHECK-NOF16-NEXT: ld.param.v2.b64 {%fd1, %fd2}, [test_copysign_f64_param_1]; ; CHECK-NOF16-NEXT: ld.param.b32 %r1, [test_copysign_f64_param_0]; ; CHECK-NOF16-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NOF16-NEXT: and.b16 %rs3, %rs2, 32767; @@ -2092,7 +2092,7 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-F16-NEXT: mov.b32 {%rs1, %rs2}, %r5; ; CHECK-F16-NEXT: cvt.f32.f16 %f1, %rs2; ; CHECK-F16-NEXT: cvt.f32.f16 %f2, %rs1; -; CHECK-F16-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1}; +; CHECK-F16-NEXT: st.param.v2.b32 [func_retval0], {%f2, %f1}; ; CHECK-F16-NEXT: ret; ; ; CHECK-NOF16-LABEL: test_copysign_extended( @@ -2114,7 +2114,7 @@ define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { ; CHECK-NOF16-NEXT: or.b16 %rs10, %rs9, %rs8; ; CHECK-NOF16-NEXT: cvt.f32.f16 %f1, %rs10; ; CHECK-NOF16-NEXT: cvt.f32.f16 %f2, %rs7; -; CHECK-NOF16-NEXT: st.param.v2.f32 [func_retval0], {%f2, %f1}; +; CHECK-NOF16-NEXT: st.param.v2.b32 [func_retval0], {%f2, %f1}; ; CHECK-NOF16-NEXT: ret; %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) %xr = fpext <2 x half> %r to <2 x float> @@ -2359,7 +2359,7 @@ define <2 x half> @test_sitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_sitofp_2xi16_to_2xhalf_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_sitofp_2xi16_to_2xhalf_param_0]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, 
%r1; ; CHECK-NEXT: cvt.rn.f16.s16 %rs3, %rs2; ; CHECK-NEXT: cvt.rn.f16.s16 %rs4, %rs1; @@ -2377,7 +2377,7 @@ define <2 x half> @test_uitofp_2xi16_to_2xhalf(<2 x i16> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_uitofp_2xi16_to_2xhalf_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_uitofp_2xi16_to_2xhalf_param_0]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.rn.f16.u16 %rs3, %rs2; ; CHECK-NEXT: cvt.rn.f16.u16 %rs4, %rs1; diff --git a/llvm/test/CodeGen/NVPTX/f32-ex2.ll b/llvm/test/CodeGen/NVPTX/f32-ex2.ll index 2c5c8146fbf6..939782eccff5 100644 --- a/llvm/test/CodeGen/NVPTX/f32-ex2.ll +++ b/llvm/test/CodeGen/NVPTX/f32-ex2.ll @@ -12,9 +12,9 @@ define float @ex2_float(float %0) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [ex2_float_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [ex2_float_param_0]; ; CHECK-NEXT: ex2.approx.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %res = call float @llvm.nvvm.ex2.approx.f(float %0) ret float %res @@ -27,9 +27,9 @@ define float @ex2_float_ftz(float %0) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [ex2_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [ex2_float_ftz_param_0]; ; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %res = call float @llvm.nvvm.ex2.approx.ftz.f(float %0) ret float %res diff --git a/llvm/test/CodeGen/NVPTX/f32-lg2.ll b/llvm/test/CodeGen/NVPTX/f32-lg2.ll index 9dac3083d6cb..2b101bc3af43 100644 --- a/llvm/test/CodeGen/NVPTX/f32-lg2.ll +++ b/llvm/test/CodeGen/NVPTX/f32-lg2.ll @@ -13,9 +13,9 @@ define float @lg2_float(float %0) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: 
ld.param.f32 %f1, [lg2_float_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [lg2_float_param_0]; ; CHECK-NEXT: lg2.approx.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %res = call float @llvm.nvvm.lg2.approx.f(float %0) ret float %res @@ -28,9 +28,9 @@ define float @lg2_float_ftz(float %0) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [lg2_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [lg2_float_ftz_param_0]; ; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %res = call float @llvm.nvvm.lg2.approx.ftz.f(float %0) ret float %res diff --git a/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll b/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll index d9c5a527b901..51434f7566c1 100644 --- a/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/fabs-intrinsics.ll @@ -21,9 +21,9 @@ define float @fabs_float(float %a) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [fabs_float_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [fabs_float_param_0]; ; CHECK-NEXT: abs.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %ret = call float @llvm.nvvm.fabs.f32(float %a) ret float %ret @@ -35,9 +35,9 @@ define float @fabs_float_ftz(float %a) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [fabs_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [fabs_float_ftz_param_0]; ; CHECK-NEXT: abs.ftz.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %ret = call float @llvm.nvvm.fabs.ftz.f32(float %a) ret float %ret @@ -49,9 +49,9 @@ define double @fabs_double(double %a) 
{ ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [fabs_double_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [fabs_double_param_0]; ; CHECK-NEXT: abs.f64 %fd2, %fd1; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd2; ; CHECK-NEXT: ret; %ret = call double @llvm.nvvm.fabs.f64(double %a) ret double %ret diff --git a/llvm/test/CodeGen/NVPTX/fexp2.ll b/llvm/test/CodeGen/NVPTX/fexp2.ll index 4664d700209f..c8940d9ae2a9 100644 --- a/llvm/test/CodeGen/NVPTX/fexp2.ll +++ b/llvm/test/CodeGen/NVPTX/fexp2.ll @@ -16,9 +16,9 @@ define float @exp2_test(float %in) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.f32 %f1, [exp2_test_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [exp2_test_param_0]; ; CHECK-NEXT: ex2.approx.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; ; ; CHECK-FP16-LABEL: exp2_test( @@ -26,9 +26,9 @@ define float @exp2_test(float %in) { ; CHECK-FP16-NEXT: .reg .b32 %f<3>; ; CHECK-FP16-EMPTY: ; CHECK-FP16-NEXT: // %bb.0: // %entry -; CHECK-FP16-NEXT: ld.param.f32 %f1, [exp2_test_param_0]; +; CHECK-FP16-NEXT: ld.param.b32 %f1, [exp2_test_param_0]; ; CHECK-FP16-NEXT: ex2.approx.f32 %f2, %f1; -; CHECK-FP16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-FP16-NEXT: ret; ; ; CHECK-BF16-LABEL: exp2_test( @@ -36,9 +36,9 @@ define float @exp2_test(float %in) { ; CHECK-BF16-NEXT: .reg .b32 %f<3>; ; CHECK-BF16-EMPTY: ; CHECK-BF16-NEXT: // %bb.0: // %entry -; CHECK-BF16-NEXT: ld.param.f32 %f1, [exp2_test_param_0]; +; CHECK-BF16-NEXT: ld.param.b32 %f1, [exp2_test_param_0]; ; CHECK-BF16-NEXT: ex2.approx.f32 %f2, %f1; -; CHECK-BF16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-BF16-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-BF16-NEXT: ret; entry: %exp2 = call float 
@llvm.exp2.f32(float %in) @@ -52,9 +52,9 @@ define float @exp2_ftz_test(float %in) #0 { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [exp2_ftz_test_param_0]; ; CHECK-NEXT: ex2.approx.ftz.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; ; ; CHECK-FP16-LABEL: exp2_ftz_test( @@ -62,9 +62,9 @@ define float @exp2_ftz_test(float %in) #0 { ; CHECK-FP16-NEXT: .reg .b32 %f<3>; ; CHECK-FP16-EMPTY: ; CHECK-FP16-NEXT: // %bb.0: // %entry -; CHECK-FP16-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0]; +; CHECK-FP16-NEXT: ld.param.b32 %f1, [exp2_ftz_test_param_0]; ; CHECK-FP16-NEXT: ex2.approx.ftz.f32 %f2, %f1; -; CHECK-FP16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-FP16-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-FP16-NEXT: ret; ; ; CHECK-BF16-LABEL: exp2_ftz_test( @@ -72,9 +72,9 @@ define float @exp2_ftz_test(float %in) #0 { ; CHECK-BF16-NEXT: .reg .b32 %f<3>; ; CHECK-BF16-EMPTY: ; CHECK-BF16-NEXT: // %bb.0: // %entry -; CHECK-BF16-NEXT: ld.param.f32 %f1, [exp2_ftz_test_param_0]; +; CHECK-BF16-NEXT: ld.param.b32 %f1, [exp2_ftz_test_param_0]; ; CHECK-BF16-NEXT: ex2.approx.ftz.f32 %f2, %f1; -; CHECK-BF16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-BF16-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-BF16-NEXT: ret; entry: %exp2 = call float @llvm.exp2.f32(float %in) @@ -88,10 +88,10 @@ define <2 x float> @exp2_test_v(<2 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%f1, %f2}, [exp2_test_v_param_0]; ; CHECK-NEXT: ex2.approx.f32 %f3, %f2; ; CHECK-NEXT: ex2.approx.f32 %f4, %f1; -; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3}; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%f4, %f3}; ; CHECK-NEXT: ret; ; ; 
CHECK-FP16-LABEL: exp2_test_v( @@ -99,10 +99,10 @@ define <2 x float> @exp2_test_v(<2 x float> %in) { ; CHECK-FP16-NEXT: .reg .b32 %f<5>; ; CHECK-FP16-EMPTY: ; CHECK-FP16-NEXT: // %bb.0: // %entry -; CHECK-FP16-NEXT: ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0]; +; CHECK-FP16-NEXT: ld.param.v2.b32 {%f1, %f2}, [exp2_test_v_param_0]; ; CHECK-FP16-NEXT: ex2.approx.f32 %f3, %f2; ; CHECK-FP16-NEXT: ex2.approx.f32 %f4, %f1; -; CHECK-FP16-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3}; +; CHECK-FP16-NEXT: st.param.v2.b32 [func_retval0], {%f4, %f3}; ; CHECK-FP16-NEXT: ret; ; ; CHECK-BF16-LABEL: exp2_test_v( @@ -110,10 +110,10 @@ define <2 x float> @exp2_test_v(<2 x float> %in) { ; CHECK-BF16-NEXT: .reg .b32 %f<5>; ; CHECK-BF16-EMPTY: ; CHECK-BF16-NEXT: // %bb.0: // %entry -; CHECK-BF16-NEXT: ld.param.v2.f32 {%f1, %f2}, [exp2_test_v_param_0]; +; CHECK-BF16-NEXT: ld.param.v2.b32 {%f1, %f2}, [exp2_test_v_param_0]; ; CHECK-BF16-NEXT: ex2.approx.f32 %f3, %f2; ; CHECK-BF16-NEXT: ex2.approx.f32 %f4, %f1; -; CHECK-BF16-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3}; +; CHECK-BF16-NEXT: st.param.v2.b32 [func_retval0], {%f4, %f3}; ; CHECK-BF16-NEXT: ret; entry: %exp2 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in) @@ -259,7 +259,7 @@ define bfloat @exp2_bf16_test(bfloat %in) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u16 %r1, [exp2_bf16_test_param_0]; +; CHECK-NEXT: ld.param.b16 %r1, [exp2_bf16_test_param_0]; ; CHECK-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-NEXT: mov.b32 %f1, %r2; ; CHECK-NEXT: ex2.approx.f32 %f2, %f1; @@ -282,7 +282,7 @@ define bfloat @exp2_bf16_test(bfloat %in) { ; CHECK-FP16-NEXT: .reg .b32 %f<3>; ; CHECK-FP16-EMPTY: ; CHECK-FP16-NEXT: // %bb.0: // %entry -; CHECK-FP16-NEXT: ld.param.u16 %r1, [exp2_bf16_test_param_0]; +; CHECK-FP16-NEXT: ld.param.b16 %r1, [exp2_bf16_test_param_0]; ; CHECK-FP16-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-FP16-NEXT: mov.b32 %f1, %r2; ; CHECK-FP16-NEXT: ex2.approx.f32 
%f2, %f1; diff --git a/llvm/test/CodeGen/NVPTX/flo.ll b/llvm/test/CodeGen/NVPTX/flo.ll index bc7f765e40ab..fc4e30439caf 100644 --- a/llvm/test/CodeGen/NVPTX/flo.ll +++ b/llvm/test/CodeGen/NVPTX/flo.ll @@ -10,7 +10,7 @@ define i32 @flo_1(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [flo_1_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [flo_1_param_0]; ; CHECK-NEXT: bfind.s32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -25,7 +25,7 @@ define i32 @flo_2(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [flo_2_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [flo_2_param_0]; ; CHECK-NEXT: bfind.shiftamt.s32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -39,7 +39,7 @@ define i32 @flo_3(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [flo_3_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [flo_3_param_0]; ; CHECK-NEXT: bfind.u32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -54,7 +54,7 @@ define i32 @flo_4(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [flo_4_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [flo_4_param_0]; ; CHECK-NEXT: bfind.shiftamt.u32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -71,7 +71,7 @@ define i32 @flo_5(i64 %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [flo_5_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [flo_5_param_0]; ; CHECK-NEXT: bfind.s64 %r1, %rd1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -87,7 +87,7 @@ define i32 @flo_6(i64 %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [flo_6_param_0]; +; CHECK-NEXT: 
ld.param.b64 %rd1, [flo_6_param_0]; ; CHECK-NEXT: bfind.shiftamt.s64 %r1, %rd1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -102,7 +102,7 @@ define i32 @flo_7(i64 %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [flo_7_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [flo_7_param_0]; ; CHECK-NEXT: bfind.u64 %r1, %rd1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -118,7 +118,7 @@ define i32 @flo_8(i64 %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [flo_8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [flo_8_param_0]; ; CHECK-NEXT: bfind.shiftamt.u64 %r1, %rd1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/flog2.ll b/llvm/test/CodeGen/NVPTX/flog2.ll index 4dfed3dd944a..d922e18edc16 100644 --- a/llvm/test/CodeGen/NVPTX/flog2.ll +++ b/llvm/test/CodeGen/NVPTX/flog2.ll @@ -10,9 +10,9 @@ define float @log2_test(float %in) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.f32 %f1, [log2_test_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [log2_test_param_0]; ; CHECK-NEXT: lg2.approx.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; entry: %log2 = call float @llvm.log2.f32(float %in) @@ -26,9 +26,9 @@ define float @log2_ftz_test(float %in) #0 { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.f32 %f1, [log2_ftz_test_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [log2_ftz_test_param_0]; ; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; entry: %log2 = call float @llvm.log2.f32(float %in) @@ -42,10 +42,10 @@ define <2 x float> @log2_test_v(<2 x float> %in) { ; 
CHECK-NEXT: .reg .b32 %f<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.v2.f32 {%f1, %f2}, [log2_test_v_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%f1, %f2}, [log2_test_v_param_0]; ; CHECK-NEXT: lg2.approx.f32 %f3, %f2; ; CHECK-NEXT: lg2.approx.f32 %f4, %f1; -; CHECK-NEXT: st.param.v2.f32 [func_retval0], {%f4, %f3}; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%f4, %f3}; ; CHECK-NEXT: ret; entry: %log2 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in) @@ -129,7 +129,7 @@ define bfloat @log2_bf16_test(bfloat %in) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u16 %r1, [log2_bf16_test_param_0]; +; CHECK-NEXT: ld.param.b16 %r1, [log2_bf16_test_param_0]; ; CHECK-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-NEXT: mov.b32 %f1, %r2; ; CHECK-NEXT: lg2.approx.f32 %f2, %f1; @@ -158,7 +158,7 @@ define bfloat @log2_bf16_ftz_test(bfloat %in) #0 { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u16 %r1, [log2_bf16_ftz_test_param_0]; +; CHECK-NEXT: ld.param.b16 %r1, [log2_bf16_ftz_test_param_0]; ; CHECK-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-NEXT: mov.b32 %f1, %r2; ; CHECK-NEXT: lg2.approx.ftz.f32 %f2, %f1; diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll index 9051a0bce14c..b971d2f237b4 100644 --- a/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll +++ b/llvm/test/CodeGen/NVPTX/fma-relu-contract.ll @@ -253,13 +253,13 @@ define bfloat @fma_bf16_expanded_unsafe_with_nans(bfloat %a, bfloat %b, bfloat % ; CHECK-SM70-NEXT: .reg .b32 %f<6>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: -; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_unsafe_with_nans_param_2]; +; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_unsafe_with_nans_param_2]; ; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r2; -; CHECK-SM70-NEXT: ld.param.u16 %r3, 
[fma_bf16_expanded_unsafe_with_nans_param_1]; +; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_unsafe_with_nans_param_1]; ; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r4; -; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_unsafe_with_nans_param_0]; +; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_unsafe_with_nans_param_0]; ; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r6; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -317,13 +317,13 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-SM70-NEXT: .reg .b32 %f<6>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: -; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_param_2]; +; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_no_nans_param_2]; ; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r2; -; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_no_nans_param_1]; +; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_no_nans_param_1]; ; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r4; -; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_no_nans_param_0]; +; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_no_nans_param_0]; ; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r6; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -405,13 +405,13 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat ; CHECK-SM70-NEXT: .reg .b32 %f<10>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: -; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2]; +; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2]; ; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r2; -; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1]; +; CHECK-SM70-NEXT: 
ld.param.b16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1]; ; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r4; -; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0]; +; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0]; ; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r6; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -493,13 +493,13 @@ define bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) ; CHECK-SM70-NEXT: .reg .b32 %f<7>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: -; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2]; +; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2]; ; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r2; -; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1]; +; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1]; ; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r4; -; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0]; +; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0]; ; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r6; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll index 73f808f1e06e..d1081de000db 100644 --- a/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll +++ b/llvm/test/CodeGen/NVPTX/fma-relu-fma-intrinsic.ll @@ -187,13 +187,13 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-SM70-NEXT: .reg .b32 %f<6>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: -; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_no_nans_param_2]; +; CHECK-SM70-NEXT: ld.param.b16 %r1, 
[fma_bf16_no_nans_param_2]; ; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r2; -; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_no_nans_param_1]; +; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_no_nans_param_1]; ; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r4; -; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_no_nans_param_0]; +; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_no_nans_param_0]; ; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r6; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -267,13 +267,13 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa ; CHECK-SM70-NEXT: .reg .b32 %f<9>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: -; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2]; +; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2]; ; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r2; -; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1]; +; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1]; ; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r4; -; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0]; +; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0]; ; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r6; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -348,13 +348,13 @@ define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-SM70-NEXT: .reg .b32 %f<7>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: -; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_maxnum_no_nans_param_2]; +; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_maxnum_no_nans_param_2]; ; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r2; -; CHECK-SM70-NEXT: 
ld.param.u16 %r3, [fma_bf16_maxnum_no_nans_param_1]; +; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_maxnum_no_nans_param_1]; ; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r4; -; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_maxnum_no_nans_param_0]; +; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_maxnum_no_nans_param_0]; ; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r6; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; diff --git a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll index b94fa5a24b50..05f7840dc3aa 100644 --- a/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll +++ b/llvm/test/CodeGen/NVPTX/fma-relu-instruction-flag.ll @@ -198,13 +198,13 @@ define bfloat @fma_bf16_expanded_no_nans(bfloat %a, bfloat %b, bfloat %c) { ; CHECK-SM70-NEXT: .reg .b32 %f<6>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: -; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_param_2]; +; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_no_nans_param_2]; ; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r2; -; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_no_nans_param_1]; +; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_no_nans_param_1]; ; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r4; -; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_no_nans_param_0]; +; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_no_nans_param_0]; ; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r6; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -286,13 +286,13 @@ define bfloat @fma_bf16_expanded_no_nans_multiple_uses_of_fma(bfloat %a, bfloat ; CHECK-SM70-NEXT: .reg .b32 %f<10>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: -; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2]; +; CHECK-SM70-NEXT: ld.param.b16 %r1, 
[fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_2]; ; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r2; -; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1]; +; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_1]; ; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r4; -; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0]; +; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_no_nans_multiple_uses_of_fma_param_0]; ; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r6; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -376,13 +376,13 @@ define bfloat @fma_bf16_expanded_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) ; CHECK-SM70-NEXT: .reg .b32 %f<7>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: -; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2]; +; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_expanded_maxnum_no_nans_param_2]; ; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r2; -; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1]; +; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_expanded_maxnum_no_nans_param_1]; ; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r4; -; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0]; +; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_expanded_maxnum_no_nans_param_0]; ; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r6; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -1134,13 +1134,13 @@ define bfloat @fma_bf16_no_nans(bfloat %a, bfloat %b, bfloat %c) { ; CHECK-SM70-NEXT: .reg .b32 %f<6>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: -; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_no_nans_param_2]; +; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_no_nans_param_2]; ; 
CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r2; -; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_no_nans_param_1]; +; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_no_nans_param_1]; ; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r4; -; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_no_nans_param_0]; +; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_no_nans_param_0]; ; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r6; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -1214,13 +1214,13 @@ define bfloat @fma_bf16_no_nans_multiple_uses_of_fma(bfloat %a, bfloat %b, bfloa ; CHECK-SM70-NEXT: .reg .b32 %f<9>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: -; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2]; +; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_no_nans_multiple_uses_of_fma_param_2]; ; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r2; -; CHECK-SM70-NEXT: ld.param.u16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1]; +; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_no_nans_multiple_uses_of_fma_param_1]; ; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r4; -; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0]; +; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_no_nans_multiple_uses_of_fma_param_0]; ; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r6; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; @@ -1295,13 +1295,13 @@ define bfloat @fma_bf16_maxnum_no_nans(bfloat %a, bfloat %b, bfloat %c) { ; CHECK-SM70-NEXT: .reg .b32 %f<7>; ; CHECK-SM70-EMPTY: ; CHECK-SM70-NEXT: // %bb.0: -; CHECK-SM70-NEXT: ld.param.u16 %r1, [fma_bf16_maxnum_no_nans_param_2]; +; CHECK-SM70-NEXT: ld.param.b16 %r1, [fma_bf16_maxnum_no_nans_param_2]; ; CHECK-SM70-NEXT: shl.b32 %r2, %r1, 16; ; CHECK-SM70-NEXT: mov.b32 %f1, %r2; -; CHECK-SM70-NEXT: ld.param.u16 %r3, 
[fma_bf16_maxnum_no_nans_param_1]; +; CHECK-SM70-NEXT: ld.param.b16 %r3, [fma_bf16_maxnum_no_nans_param_1]; ; CHECK-SM70-NEXT: shl.b32 %r4, %r3, 16; ; CHECK-SM70-NEXT: mov.b32 %f2, %r4; -; CHECK-SM70-NEXT: ld.param.u16 %r5, [fma_bf16_maxnum_no_nans_param_0]; +; CHECK-SM70-NEXT: ld.param.b16 %r5, [fma_bf16_maxnum_no_nans_param_0]; ; CHECK-SM70-NEXT: shl.b32 %r6, %r5, 16; ; CHECK-SM70-NEXT: mov.b32 %f3, %r6; ; CHECK-SM70-NEXT: fma.rn.f32 %f4, %f3, %f2, %f1; diff --git a/llvm/test/CodeGen/NVPTX/fns.ll b/llvm/test/CodeGen/NVPTX/fns.ll index c51f9aee551c..b153e298bbff 100644 --- a/llvm/test/CodeGen/NVPTX/fns.ll +++ b/llvm/test/CodeGen/NVPTX/fns.ll @@ -5,9 +5,9 @@ declare i32 @llvm.nvvm.fns(i32, i32, i32) ; CHECK-LABEL: .func{{.*}}fns define i32 @fns(i32 %mask, i32 %base, i32 %offset) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]], [fns_param_0]; - ; CHECK: ld.param.u32 [[BASE:%r[0-9]+]], [fns_param_1]; - ; CHECK: ld.param.u32 [[OFFSET:%r[0-9]+]], [fns_param_2]; + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]], [fns_param_0]; + ; CHECK: ld.param.b32 [[BASE:%r[0-9]+]], [fns_param_1]; + ; CHECK: ld.param.b32 [[OFFSET:%r[0-9]+]], [fns_param_2]; ; CHECK: fns.b32 {{%r[0-9]+}}, [[MASK]], [[BASE]], [[OFFSET]]; %r0 = call i32 @llvm.nvvm.fns(i32 %mask, i32 %base, i32 %offset); diff --git a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll index 80ae8aac3911..d253df5ed1b9 100644 --- a/llvm/test/CodeGen/NVPTX/forward-ld-param.ll +++ b/llvm/test/CodeGen/NVPTX/forward-ld-param.ll @@ -10,7 +10,7 @@ define i32 @test_ld_param_const(ptr byval(i32) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_ld_param_const_param_0+4]; +; CHECK-NEXT: ld.param.b32 %r1, [test_ld_param_const_param_0+4]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %p2 = getelementptr i32, ptr %a, i32 1 @@ -28,7 +28,7 @@ define i32 @test_ld_param_non_const(ptr byval([10 x i32]) %a, i32 %b) { ; CHECK-NEXT: 
mov.b64 %rd1, test_ld_param_non_const_param_0; ; CHECK-NEXT: ld.param.s32 %rd2, [test_ld_param_non_const_param_1]; ; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2; -; CHECK-NEXT: ld.local.u32 %r1, [%rd3]; +; CHECK-NEXT: ld.local.b32 %r1, [%rd3]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %p2 = getelementptr i8, ptr %a, i32 %b @@ -68,7 +68,7 @@ define void @test_ld_param_byval(ptr byval(i32) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_ld_param_byval_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_ld_param_byval_param_0]; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.b32 [param0], %r1; @@ -91,9 +91,9 @@ define i32 @test_modify_param(ptr byval([10 x i32]) %a, i32 %b, i32 %c ) { ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: mov.b64 %rd1, test_modify_param_param_0; -; CHECK-NEXT: ld.param.u32 %r1, [test_modify_param_param_1]; -; CHECK-NEXT: ld.param.u32 %r2, [test_modify_param_param_2]; -; CHECK-NEXT: st.local.u32 [%rd1+2], %r1; +; CHECK-NEXT: ld.param.b32 %r1, [test_modify_param_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [test_modify_param_param_2]; +; CHECK-NEXT: st.local.b32 [%rd1+2], %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; %p2 = getelementptr i8, ptr %a, i32 2 @@ -110,16 +110,16 @@ define i32 @test_multi_block(ptr byval([10 x i32]) %a, i1 %p) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [test_multi_block_param_1]; +; CHECK-NEXT: ld.param.b8 %rs1, [test_multi_block_param_1]; ; CHECK-NEXT: and.b16 %rs2, %rs1, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; ; CHECK-NEXT: not.pred %p2, %p1; ; CHECK-NEXT: @%p2 bra $L__BB5_2; ; CHECK-NEXT: // %bb.1: // %if -; CHECK-NEXT: ld.param.u32 %r4, [test_multi_block_param_0+4]; +; CHECK-NEXT: ld.param.b32 %r4, [test_multi_block_param_0+4]; ; CHECK-NEXT: bra.uni $L__BB5_3; ; CHECK-NEXT: $L__BB5_2: // 
%else -; CHECK-NEXT: ld.param.u32 %r4, [test_multi_block_param_0+8]; +; CHECK-NEXT: ld.param.b32 %r4, [test_multi_block_param_0+8]; ; CHECK-NEXT: $L__BB5_3: // %end ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/fp-contract.ll b/llvm/test/CodeGen/NVPTX/fp-contract.ll index bd559ea157fe..89a402db8e42 100644 --- a/llvm/test/CodeGen/NVPTX/fp-contract.ll +++ b/llvm/test/CodeGen/NVPTX/fp-contract.ll @@ -18,11 +18,11 @@ define float @t0(float %a, float %b, float %c) { ; FAST-NEXT: .reg .b32 %f<5>; ; FAST-EMPTY: ; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.f32 %f1, [t0_param_0]; -; FAST-NEXT: ld.param.f32 %f2, [t0_param_1]; -; FAST-NEXT: ld.param.f32 %f3, [t0_param_2]; +; FAST-NEXT: ld.param.b32 %f1, [t0_param_0]; +; FAST-NEXT: ld.param.b32 %f2, [t0_param_1]; +; FAST-NEXT: ld.param.b32 %f3, [t0_param_2]; ; FAST-NEXT: fma.rn.f32 %f4, %f1, %f2, %f3; -; FAST-NEXT: st.param.f32 [func_retval0], %f4; +; FAST-NEXT: st.param.b32 [func_retval0], %f4; ; FAST-NEXT: ret; ; ; DEFAULT-LABEL: t0( @@ -30,12 +30,12 @@ define float @t0(float %a, float %b, float %c) { ; DEFAULT-NEXT: .reg .b32 %f<6>; ; DEFAULT-EMPTY: ; DEFAULT-NEXT: // %bb.0: -; DEFAULT-NEXT: ld.param.f32 %f1, [t0_param_0]; -; DEFAULT-NEXT: ld.param.f32 %f2, [t0_param_1]; +; DEFAULT-NEXT: ld.param.b32 %f1, [t0_param_0]; +; DEFAULT-NEXT: ld.param.b32 %f2, [t0_param_1]; ; DEFAULT-NEXT: mul.rn.f32 %f3, %f1, %f2; -; DEFAULT-NEXT: ld.param.f32 %f4, [t0_param_2]; +; DEFAULT-NEXT: ld.param.b32 %f4, [t0_param_2]; ; DEFAULT-NEXT: add.rn.f32 %f5, %f3, %f4; -; DEFAULT-NEXT: st.param.f32 [func_retval0], %f5; +; DEFAULT-NEXT: st.param.b32 [func_retval0], %f5; ; DEFAULT-NEXT: ret; %v0 = fmul float %a, %b %v1 = fadd float %v0, %c @@ -50,12 +50,12 @@ define float @t1(float %a, float %b) { ; FAST-NEXT: .reg .b32 %f<6>; ; FAST-EMPTY: ; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.f32 %f1, [t1_param_0]; -; FAST-NEXT: ld.param.f32 %f2, [t1_param_1]; +; FAST-NEXT: ld.param.b32 %f1, 
[t1_param_0]; +; FAST-NEXT: ld.param.b32 %f2, [t1_param_1]; ; FAST-NEXT: add.f32 %f3, %f1, %f2; ; FAST-NEXT: sub.f32 %f4, %f1, %f2; ; FAST-NEXT: mul.f32 %f5, %f3, %f4; -; FAST-NEXT: st.param.f32 [func_retval0], %f5; +; FAST-NEXT: st.param.b32 [func_retval0], %f5; ; FAST-NEXT: ret; ; ; DEFAULT-LABEL: t1( @@ -63,12 +63,12 @@ define float @t1(float %a, float %b) { ; DEFAULT-NEXT: .reg .b32 %f<6>; ; DEFAULT-EMPTY: ; DEFAULT-NEXT: // %bb.0: -; DEFAULT-NEXT: ld.param.f32 %f1, [t1_param_0]; -; DEFAULT-NEXT: ld.param.f32 %f2, [t1_param_1]; +; DEFAULT-NEXT: ld.param.b32 %f1, [t1_param_0]; +; DEFAULT-NEXT: ld.param.b32 %f2, [t1_param_1]; ; DEFAULT-NEXT: add.rn.f32 %f3, %f1, %f2; ; DEFAULT-NEXT: sub.rn.f32 %f4, %f1, %f2; ; DEFAULT-NEXT: mul.rn.f32 %f5, %f3, %f4; -; DEFAULT-NEXT: st.param.f32 [func_retval0], %f5; +; DEFAULT-NEXT: st.param.b32 [func_retval0], %f5; ; DEFAULT-NEXT: ret; %v1 = fadd float %a, %b %v2 = fsub float %a, %b @@ -84,12 +84,12 @@ define float @t2(float %a, float %b) { ; CHECK-NEXT: .reg .b32 %f<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [t2_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [t2_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [t2_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [t2_param_1]; ; CHECK-NEXT: add.f32 %f3, %f1, %f2; ; CHECK-NEXT: sub.f32 %f4, %f1, %f2; ; CHECK-NEXT: mul.f32 %f5, %f3, %f4; -; CHECK-NEXT: st.param.f32 [func_retval0], %f5; +; CHECK-NEXT: st.param.b32 [func_retval0], %f5; ; CHECK-NEXT: ret; %v1 = fadd contract float %a, %b %v2 = fsub contract float %a, %b @@ -104,11 +104,11 @@ define float @t3(float %a, float %b, float %c) { ; CHECK-NEXT: .reg .b32 %f<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [t3_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [t3_param_1]; -; CHECK-NEXT: ld.param.f32 %f3, [t3_param_2]; +; CHECK-NEXT: ld.param.b32 %f1, [t3_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [t3_param_1]; +; CHECK-NEXT: ld.param.b32 %f3, [t3_param_2]; ; CHECK-NEXT: fma.rn.f32 %f4, 
%f1, %f2, %f3; -; CHECK-NEXT: st.param.f32 [func_retval0], %f4; +; CHECK-NEXT: st.param.b32 [func_retval0], %f4; ; CHECK-NEXT: ret; %v0 = fmul contract float %a, %b %v1 = fadd contract float %v0, %c diff --git a/llvm/test/CodeGen/NVPTX/fp128-storage-type.ll b/llvm/test/CodeGen/NVPTX/fp128-storage-type.ll index fa8cc6e23b6b..d40f514acd40 100644 --- a/llvm/test/CodeGen/NVPTX/fp128-storage-type.ll +++ b/llvm/test/CodeGen/NVPTX/fp128-storage-type.ll @@ -10,7 +10,7 @@ define fp128 @identity(fp128 %x) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [identity_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [identity_param_0]; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; ; CHECK-NEXT: ret; ret fp128 %x @@ -22,10 +22,10 @@ define void @load_store(ptr %in, ptr %out) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [load_store_param_0]; -; CHECK-NEXT: ld.v2.u64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.param.u64 %rd4, [load_store_param_1]; -; CHECK-NEXT: st.v2.u64 [%rd4], {%rd2, %rd3}; +; CHECK-NEXT: ld.param.b64 %rd1, [load_store_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd4, [load_store_param_1]; +; CHECK-NEXT: st.v2.b64 [%rd4], {%rd2, %rd3}; ; CHECK-NEXT: ret; %val = load fp128, ptr %in store fp128 %val, ptr %out @@ -38,7 +38,7 @@ define void @call(fp128 %x) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [call_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [call_param_0]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v2.b64 [param0], {%rd1, %rd2}; diff --git a/llvm/test/CodeGen/NVPTX/frem.ll b/llvm/test/CodeGen/NVPTX/frem.ll index 4077f6d1eb21..c0658f85205e 100644 --- a/llvm/test/CodeGen/NVPTX/frem.ll +++ b/llvm/test/CodeGen/NVPTX/frem.ll @@ -54,13 
+54,13 @@ define float @frem_f32(float %a, float %b) { ; FAST-NEXT: .reg .b32 %f<7>; ; FAST-EMPTY: ; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.f32 %f1, [frem_f32_param_0]; -; FAST-NEXT: ld.param.f32 %f2, [frem_f32_param_1]; +; FAST-NEXT: ld.param.b32 %f1, [frem_f32_param_0]; +; FAST-NEXT: ld.param.b32 %f2, [frem_f32_param_1]; ; FAST-NEXT: div.approx.f32 %f3, %f1, %f2; ; FAST-NEXT: cvt.rzi.f32.f32 %f4, %f3; ; FAST-NEXT: neg.f32 %f5, %f4; ; FAST-NEXT: fma.rn.f32 %f6, %f5, %f2, %f1; -; FAST-NEXT: st.param.f32 [func_retval0], %f6; +; FAST-NEXT: st.param.b32 [func_retval0], %f6; ; FAST-NEXT: ret; ; ; NORMAL-LABEL: frem_f32( @@ -69,15 +69,15 @@ define float @frem_f32(float %a, float %b) { ; NORMAL-NEXT: .reg .b32 %f<8>; ; NORMAL-EMPTY: ; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.f32 %f1, [frem_f32_param_0]; -; NORMAL-NEXT: ld.param.f32 %f2, [frem_f32_param_1]; +; NORMAL-NEXT: ld.param.b32 %f1, [frem_f32_param_0]; +; NORMAL-NEXT: ld.param.b32 %f2, [frem_f32_param_1]; ; NORMAL-NEXT: div.rn.f32 %f3, %f1, %f2; ; NORMAL-NEXT: cvt.rzi.f32.f32 %f4, %f3; ; NORMAL-NEXT: neg.f32 %f5, %f4; ; NORMAL-NEXT: fma.rn.f32 %f6, %f5, %f2, %f1; ; NORMAL-NEXT: testp.infinite.f32 %p1, %f2; ; NORMAL-NEXT: selp.f32 %f7, %f1, %f6, %p1; -; NORMAL-NEXT: st.param.f32 [func_retval0], %f7; +; NORMAL-NEXT: st.param.b32 [func_retval0], %f7; ; NORMAL-NEXT: ret; %r = frem float %a, %b ret float %r @@ -89,13 +89,13 @@ define double @frem_f64(double %a, double %b) { ; FAST-NEXT: .reg .b64 %fd<7>; ; FAST-EMPTY: ; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.f64 %fd1, [frem_f64_param_0]; -; FAST-NEXT: ld.param.f64 %fd2, [frem_f64_param_1]; +; FAST-NEXT: ld.param.b64 %fd1, [frem_f64_param_0]; +; FAST-NEXT: ld.param.b64 %fd2, [frem_f64_param_1]; ; FAST-NEXT: div.rn.f64 %fd3, %fd1, %fd2; ; FAST-NEXT: cvt.rzi.f64.f64 %fd4, %fd3; ; FAST-NEXT: neg.f64 %fd5, %fd4; ; FAST-NEXT: fma.rn.f64 %fd6, %fd5, %fd2, %fd1; -; FAST-NEXT: st.param.f64 [func_retval0], %fd6; +; FAST-NEXT: st.param.b64 [func_retval0], %fd6; ; 
FAST-NEXT: ret; ; ; NORMAL-LABEL: frem_f64( @@ -104,15 +104,15 @@ define double @frem_f64(double %a, double %b) { ; NORMAL-NEXT: .reg .b64 %fd<8>; ; NORMAL-EMPTY: ; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.f64 %fd1, [frem_f64_param_0]; -; NORMAL-NEXT: ld.param.f64 %fd2, [frem_f64_param_1]; +; NORMAL-NEXT: ld.param.b64 %fd1, [frem_f64_param_0]; +; NORMAL-NEXT: ld.param.b64 %fd2, [frem_f64_param_1]; ; NORMAL-NEXT: div.rn.f64 %fd3, %fd1, %fd2; ; NORMAL-NEXT: cvt.rzi.f64.f64 %fd4, %fd3; ; NORMAL-NEXT: neg.f64 %fd5, %fd4; ; NORMAL-NEXT: fma.rn.f64 %fd6, %fd5, %fd2, %fd1; ; NORMAL-NEXT: testp.infinite.f64 %p1, %fd2; ; NORMAL-NEXT: selp.f64 %fd7, %fd1, %fd6, %p1; -; NORMAL-NEXT: st.param.f64 [func_retval0], %fd7; +; NORMAL-NEXT: st.param.b64 [func_retval0], %fd7; ; NORMAL-NEXT: ret; %r = frem double %a, %b ret double %r @@ -164,13 +164,13 @@ define float @frem_f32_ninf(float %a, float %b) { ; FAST-NEXT: .reg .b32 %f<7>; ; FAST-EMPTY: ; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.f32 %f1, [frem_f32_ninf_param_0]; -; FAST-NEXT: ld.param.f32 %f2, [frem_f32_ninf_param_1]; +; FAST-NEXT: ld.param.b32 %f1, [frem_f32_ninf_param_0]; +; FAST-NEXT: ld.param.b32 %f2, [frem_f32_ninf_param_1]; ; FAST-NEXT: div.approx.f32 %f3, %f1, %f2; ; FAST-NEXT: cvt.rzi.f32.f32 %f4, %f3; ; FAST-NEXT: neg.f32 %f5, %f4; ; FAST-NEXT: fma.rn.f32 %f6, %f5, %f2, %f1; -; FAST-NEXT: st.param.f32 [func_retval0], %f6; +; FAST-NEXT: st.param.b32 [func_retval0], %f6; ; FAST-NEXT: ret; ; ; NORMAL-LABEL: frem_f32_ninf( @@ -178,13 +178,13 @@ define float @frem_f32_ninf(float %a, float %b) { ; NORMAL-NEXT: .reg .b32 %f<7>; ; NORMAL-EMPTY: ; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.f32 %f1, [frem_f32_ninf_param_0]; -; NORMAL-NEXT: ld.param.f32 %f2, [frem_f32_ninf_param_1]; +; NORMAL-NEXT: ld.param.b32 %f1, [frem_f32_ninf_param_0]; +; NORMAL-NEXT: ld.param.b32 %f2, [frem_f32_ninf_param_1]; ; NORMAL-NEXT: div.rn.f32 %f3, %f1, %f2; ; NORMAL-NEXT: cvt.rzi.f32.f32 %f4, %f3; ; NORMAL-NEXT: neg.f32 %f5, %f4; 
; NORMAL-NEXT: fma.rn.f32 %f6, %f5, %f2, %f1; -; NORMAL-NEXT: st.param.f32 [func_retval0], %f6; +; NORMAL-NEXT: st.param.b32 [func_retval0], %f6; ; NORMAL-NEXT: ret; %r = frem ninf float %a, %b ret float %r @@ -196,13 +196,13 @@ define double @frem_f64_ninf(double %a, double %b) { ; FAST-NEXT: .reg .b64 %fd<7>; ; FAST-EMPTY: ; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.f64 %fd1, [frem_f64_ninf_param_0]; -; FAST-NEXT: ld.param.f64 %fd2, [frem_f64_ninf_param_1]; +; FAST-NEXT: ld.param.b64 %fd1, [frem_f64_ninf_param_0]; +; FAST-NEXT: ld.param.b64 %fd2, [frem_f64_ninf_param_1]; ; FAST-NEXT: div.rn.f64 %fd3, %fd1, %fd2; ; FAST-NEXT: cvt.rzi.f64.f64 %fd4, %fd3; ; FAST-NEXT: neg.f64 %fd5, %fd4; ; FAST-NEXT: fma.rn.f64 %fd6, %fd5, %fd2, %fd1; -; FAST-NEXT: st.param.f64 [func_retval0], %fd6; +; FAST-NEXT: st.param.b64 [func_retval0], %fd6; ; FAST-NEXT: ret; ; ; NORMAL-LABEL: frem_f64_ninf( @@ -210,13 +210,13 @@ define double @frem_f64_ninf(double %a, double %b) { ; NORMAL-NEXT: .reg .b64 %fd<7>; ; NORMAL-EMPTY: ; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.f64 %fd1, [frem_f64_ninf_param_0]; -; NORMAL-NEXT: ld.param.f64 %fd2, [frem_f64_ninf_param_1]; +; NORMAL-NEXT: ld.param.b64 %fd1, [frem_f64_ninf_param_0]; +; NORMAL-NEXT: ld.param.b64 %fd2, [frem_f64_ninf_param_1]; ; NORMAL-NEXT: div.rn.f64 %fd3, %fd1, %fd2; ; NORMAL-NEXT: cvt.rzi.f64.f64 %fd4, %fd3; ; NORMAL-NEXT: neg.f64 %fd5, %fd4; ; NORMAL-NEXT: fma.rn.f64 %fd6, %fd5, %fd2, %fd1; -; NORMAL-NEXT: st.param.f64 [func_retval0], %fd6; +; NORMAL-NEXT: st.param.b64 [func_retval0], %fd6; ; NORMAL-NEXT: ret; %r = frem ninf double %a, %b ret double %r @@ -228,11 +228,11 @@ define float @frem_f32_imm1(float %a) { ; FAST-NEXT: .reg .b32 %f<5>; ; FAST-EMPTY: ; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.f32 %f1, [frem_f32_imm1_param_0]; +; FAST-NEXT: ld.param.b32 %f1, [frem_f32_imm1_param_0]; ; FAST-NEXT: mul.f32 %f2, %f1, 0f3E124925; ; FAST-NEXT: cvt.rzi.f32.f32 %f3, %f2; ; FAST-NEXT: fma.rn.f32 %f4, %f3, 0fC0E00000, %f1; -; 
FAST-NEXT: st.param.f32 [func_retval0], %f4; +; FAST-NEXT: st.param.b32 [func_retval0], %f4; ; FAST-NEXT: ret; ; ; NORMAL-LABEL: frem_f32_imm1( @@ -240,11 +240,11 @@ define float @frem_f32_imm1(float %a) { ; NORMAL-NEXT: .reg .b32 %f<5>; ; NORMAL-EMPTY: ; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.f32 %f1, [frem_f32_imm1_param_0]; +; NORMAL-NEXT: ld.param.b32 %f1, [frem_f32_imm1_param_0]; ; NORMAL-NEXT: div.rn.f32 %f2, %f1, 0f40E00000; ; NORMAL-NEXT: cvt.rzi.f32.f32 %f3, %f2; ; NORMAL-NEXT: fma.rn.f32 %f4, %f3, 0fC0E00000, %f1; -; NORMAL-NEXT: st.param.f32 [func_retval0], %f4; +; NORMAL-NEXT: st.param.b32 [func_retval0], %f4; ; NORMAL-NEXT: ret; %r = frem float %a, 7.0 ret float %r @@ -256,13 +256,13 @@ define float @frem_f32_imm2(float %a) { ; FAST-NEXT: .reg .b32 %f<7>; ; FAST-EMPTY: ; FAST-NEXT: // %bb.0: -; FAST-NEXT: ld.param.f32 %f1, [frem_f32_imm2_param_0]; +; FAST-NEXT: ld.param.b32 %f1, [frem_f32_imm2_param_0]; ; FAST-NEXT: mov.b32 %f2, 0f40E00000; ; FAST-NEXT: div.approx.f32 %f3, %f2, %f1; ; FAST-NEXT: cvt.rzi.f32.f32 %f4, %f3; ; FAST-NEXT: neg.f32 %f5, %f4; ; FAST-NEXT: fma.rn.f32 %f6, %f5, %f1, 0f40E00000; -; FAST-NEXT: st.param.f32 [func_retval0], %f6; +; FAST-NEXT: st.param.b32 [func_retval0], %f6; ; FAST-NEXT: ret; ; ; NORMAL-LABEL: frem_f32_imm2( @@ -271,7 +271,7 @@ define float @frem_f32_imm2(float %a) { ; NORMAL-NEXT: .reg .b32 %f<8>; ; NORMAL-EMPTY: ; NORMAL-NEXT: // %bb.0: -; NORMAL-NEXT: ld.param.f32 %f1, [frem_f32_imm2_param_0]; +; NORMAL-NEXT: ld.param.b32 %f1, [frem_f32_imm2_param_0]; ; NORMAL-NEXT: mov.b32 %f2, 0f40E00000; ; NORMAL-NEXT: div.rn.f32 %f3, %f2, %f1; ; NORMAL-NEXT: cvt.rzi.f32.f32 %f4, %f3; @@ -279,7 +279,7 @@ define float @frem_f32_imm2(float %a) { ; NORMAL-NEXT: fma.rn.f32 %f6, %f5, %f1, 0f40E00000; ; NORMAL-NEXT: testp.infinite.f32 %p1, %f1; ; NORMAL-NEXT: selp.f32 %f7, 0f40E00000, %f6, %p1; -; NORMAL-NEXT: st.param.f32 [func_retval0], %f7; +; NORMAL-NEXT: st.param.b32 [func_retval0], %f7; ; NORMAL-NEXT: ret; %r = frem 
float 7.0, %a ret float %r diff --git a/llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll b/llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll index 6f1532708f28..e06cf0fc4d48 100644 --- a/llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll +++ b/llvm/test/CodeGen/NVPTX/funnel-shift-clamp.ll @@ -13,9 +13,9 @@ define i32 @fshr_clamp_r(i32 %hi, i32 %lo, i32 %n) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [fshr_clamp_r_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [fshr_clamp_r_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [fshr_clamp_r_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [fshr_clamp_r_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [fshr_clamp_r_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [fshr_clamp_r_param_2]; ; CHECK-NEXT: shf.r.clamp.b32 %r4, %r2, %r1, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -29,9 +29,9 @@ define i32 @fshl_clamp_r(i32 %hi, i32 %lo, i32 %n) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [fshl_clamp_r_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [fshl_clamp_r_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [fshl_clamp_r_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [fshl_clamp_r_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [fshl_clamp_r_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [fshl_clamp_r_param_2]; ; CHECK-NEXT: shf.l.clamp.b32 %r4, %r2, %r1, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -45,8 +45,8 @@ define i32 @fshr_clamp_i(i32 %hi, i32 %lo) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [fshr_clamp_i_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [fshr_clamp_i_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [fshr_clamp_i_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [fshr_clamp_i_param_1]; ; CHECK-NEXT: shf.r.clamp.b32 %r3, %r2, %r1, 3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -60,8 +60,8 @@ define i32 
@fshl_clamp_i(i32 %hi, i32 %lo) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [fshl_clamp_i_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [fshl_clamp_i_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [fshl_clamp_i_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [fshl_clamp_i_param_1]; ; CHECK-NEXT: shf.l.clamp.b32 %r3, %r2, %r1, 3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll index 2b6631154e38..04d8dbfcafb3 100644 --- a/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll +++ b/llvm/test/CodeGen/NVPTX/generic-to-nvvm.ll @@ -15,9 +15,9 @@ target triple = "nvptx-nvidia-cuda" define ptx_kernel void @foo(ptr %a, ptr %b) { ; Expect one load -- @myconst isn't loaded from, because we know its value ; statically. -; CHECK: ld.global.u32 -; CHECK: st.global.u32 -; CHECK: st.global.u32 +; CHECK: ld.global.b32 +; CHECK: st.global.b32 +; CHECK: st.global.b32 %ld1 = load i32, ptr @myglobal %ld2 = load i32, ptr @myconst store i32 %ld1, ptr %a diff --git a/llvm/test/CodeGen/NVPTX/globals_lowering.ll b/llvm/test/CodeGen/NVPTX/globals_lowering.ll index b0b7aeb0900a..d94e47fe3ba6 100644 --- a/llvm/test/CodeGen/NVPTX/globals_lowering.ll +++ b/llvm/test/CodeGen/NVPTX/globals_lowering.ll @@ -7,10 +7,10 @@ ; CHK-LABEL: foo define void @foo(float %f) { entry: - ; CHK: ld.shared.f32 %{{[a-zA-Z0-9]+}}, [Gbl+8]; + ; CHK: ld.shared.b32 %{{[a-zA-Z0-9]+}}, [Gbl+8]; %0 = load float, ptr addrspace(3) getelementptr inbounds ([1024 x %MyStruct], ptr addrspace(3) @Gbl, i32 0, i32 0, i32 2) %add = fadd float %0, %f - ; CHK: st.shared.f32 [Gbl+8], %{{[a-zA-Z0-9]+}}; + ; CHK: st.shared.b32 [Gbl+8], %{{[a-zA-Z0-9]+}}; store float %add, ptr addrspace(3) getelementptr inbounds ([1024 x %MyStruct], ptr addrspace(3) @Gbl, i32 0, i32 0, i32 2) ret void } diff --git a/llvm/test/CodeGen/NVPTX/half.ll b/llvm/test/CodeGen/NVPTX/half.ll index 
1b53e246ecd1..a3ccf6e565b4 100644 --- a/llvm/test/CodeGen/NVPTX/half.ll +++ b/llvm/test/CodeGen/NVPTX/half.ll @@ -26,8 +26,8 @@ define void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addrspace(1) %out) define void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; CHECK-LABEL: @test_bitcast_to_half -; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}] -; CHECK: st.global.u16 [{{%rd[0-9]+}}], [[TMP]] +; CHECK: ld.global.b16 [[TMP:%rs[0-9]+]], [{{%rd[0-9]+}}] +; CHECK: st.global.b16 [{{%rd[0-9]+}}], [[TMP]] %val = load i16, ptr addrspace(1) %in %val_fp = bitcast i16 %val to half store half %val_fp, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll index 44ac46db254a..bb88d1f2755c 100644 --- a/llvm/test/CodeGen/NVPTX/i1-ext-load.ll +++ b/llvm/test/CodeGen/NVPTX/i1-ext-load.ll @@ -11,15 +11,15 @@ define ptx_kernel void @foo(ptr noalias readonly %ptr, ptr noalias %retval) { ; CHECK: .reg .b32 %r<4>; ; CHECK: .reg .b64 %rd<5>; ; CHECK-EMPTY: -; CHECK: ld.param.u64 %rd1, [foo_param_0]; +; CHECK: ld.param.b64 %rd1, [foo_param_0]; ; CHECK: cvta.to.global.u64 %rd2, %rd1; -; CHECK: ld.param.u64 %rd3, [foo_param_1]; +; CHECK: ld.param.b64 %rd3, [foo_param_1]; ; CHECK: cvta.to.global.u64 %rd4, %rd3; -; CHECK: ld.global.nc.u8 %rs1, [%rd2]; +; CHECK: ld.global.nc.b8 %rs1, [%rd2]; ; CHECK: cvt.u32.u8 %r1, %rs1; ; CHECK: add.s32 %r2, %r1, 1; ; CHECK: and.b32 %r3, %r2, 1; -; CHECK: st.global.u32 [%rd4], %r3; +; CHECK: st.global.b32 [%rd4], %r3; ; CHECK: ret; %ld = load i1, ptr %ptr, align 1 %zext = zext i1 %ld to i32 diff --git a/llvm/test/CodeGen/NVPTX/i1-icmp.ll b/llvm/test/CodeGen/NVPTX/i1-icmp.ll index 620f09653c95..e43a9da88a50 100644 --- a/llvm/test/CodeGen/NVPTX/i1-icmp.ll +++ b/llvm/test/CodeGen/NVPTX/i1-icmp.ll @@ -11,9 +11,9 @@ define i32 @icmp_i1_eq(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, 
[icmp_i1_eq_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_eq_param_0]; ; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1; -; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_eq_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_eq_param_1]; ; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1; ; CHECK-NEXT: xor.pred %p3, %p1, %p2; ; CHECK-NEXT: @%p3 bra $L__BB0_2; @@ -42,9 +42,9 @@ define i32 @icmp_i1_ne(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_ne_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_ne_param_0]; ; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1; -; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_ne_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_ne_param_1]; ; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1; ; CHECK-NEXT: xor.pred %p3, %p1, %p2; ; CHECK-NEXT: not.pred %p4, %p3; @@ -74,9 +74,9 @@ define i32 @icmp_i1_sgt(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_sgt_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_sgt_param_0]; ; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1; -; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_sgt_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_sgt_param_1]; ; CHECK-NEXT: setp.lt.s32 %p2, %r2, 2; ; CHECK-NEXT: or.pred %p3, %p1, %p2; ; CHECK-NEXT: @%p3 bra $L__BB2_2; @@ -105,9 +105,9 @@ define i32 @icmp_i1_slt(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_slt_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_slt_param_0]; ; CHECK-NEXT: setp.lt.s32 %p1, %r1, 2; -; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_slt_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_slt_param_1]; ; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1; ; CHECK-NEXT: or.pred %p3, %p2, %p1; ; CHECK-NEXT: @%p3 bra $L__BB3_2; @@ -136,9 +136,9 @@ define i32 @icmp_i1_sge(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: 
ld.param.u32 %r1, [icmp_i1_sge_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_sge_param_0]; ; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1; -; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_sge_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_sge_param_1]; ; CHECK-NEXT: setp.lt.s32 %p2, %r2, 2; ; CHECK-NEXT: and.pred %p3, %p1, %p2; ; CHECK-NEXT: @%p3 bra $L__BB4_2; @@ -167,9 +167,9 @@ define i32 @icmp_i1_sle(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_sle_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_sle_param_0]; ; CHECK-NEXT: setp.lt.s32 %p1, %r1, 2; -; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_sle_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_sle_param_1]; ; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1; ; CHECK-NEXT: and.pred %p3, %p2, %p1; ; CHECK-NEXT: @%p3 bra $L__BB5_2; @@ -198,9 +198,9 @@ define i32 @icmp_i1_uge(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_uge_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_uge_param_0]; ; CHECK-NEXT: setp.lt.s32 %p1, %r1, 2; -; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_uge_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_uge_param_1]; ; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1; ; CHECK-NEXT: and.pred %p3, %p2, %p1; ; CHECK-NEXT: @%p3 bra $L__BB6_2; @@ -229,9 +229,9 @@ define i32 @icmp_i1_ugt(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_ugt_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_ugt_param_0]; ; CHECK-NEXT: setp.lt.s32 %p1, %r1, 2; -; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_ugt_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_ugt_param_1]; ; CHECK-NEXT: setp.gt.s32 %p2, %r2, 1; ; CHECK-NEXT: or.pred %p3, %p2, %p1; ; CHECK-NEXT: @%p3 bra $L__BB7_2; @@ -260,9 +260,9 @@ define i32 @icmp_i1_ule(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: 
// %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_ule_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_ule_param_0]; ; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1; -; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_ule_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_ule_param_1]; ; CHECK-NEXT: setp.lt.s32 %p2, %r2, 2; ; CHECK-NEXT: and.pred %p3, %p1, %p2; ; CHECK-NEXT: @%p3 bra $L__BB8_2; @@ -291,9 +291,9 @@ define i32 @icmp_i1_ult(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [icmp_i1_ult_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [icmp_i1_ult_param_0]; ; CHECK-NEXT: setp.gt.s32 %p1, %r1, 1; -; CHECK-NEXT: ld.param.u32 %r2, [icmp_i1_ult_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [icmp_i1_ult_param_1]; ; CHECK-NEXT: setp.lt.s32 %p2, %r2, 2; ; CHECK-NEXT: or.pred %p3, %p1, %p2; ; CHECK-NEXT: @%p3 bra $L__BB9_2; diff --git a/llvm/test/CodeGen/NVPTX/i1-load-lower.ll b/llvm/test/CodeGen/NVPTX/i1-load-lower.ll index 84fd8226bb60..50d39c88a46b 100644 --- a/llvm/test/CodeGen/NVPTX/i1-load-lower.ll +++ b/llvm/test/CodeGen/NVPTX/i1-load-lower.ll @@ -12,12 +12,12 @@ define void @foo() { ; CHECK: .reg .pred %p<2>; ; CHECK: .reg .b16 %rs<4>; ; CHECK-EMPTY: -; CHECK: ld.global.u8 %rs1, [i1g]; +; CHECK: ld.global.b8 %rs1, [i1g]; ; CHECK: and.b16 %rs2, %rs1, 1; ; CHECK: setp.ne.b16 %p1, %rs2, 0; ; CHECK: @%p1 bra $L__BB0_2; ; CHECK: mov.b16 %rs3, 1; -; CHECK: st.global.u8 [i1g], %rs3; +; CHECK: st.global.b8 [i1g], %rs3; ; CHECK: ret; %tmp = load i1, ptr addrspace(1) @i1g, align 2 br i1 %tmp, label %if.end, label %if.then diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll index d24b06c4d721..6fb5aad4b1eb 100644 --- a/llvm/test/CodeGen/NVPTX/i1-select.ll +++ b/llvm/test/CodeGen/NVPTX/i1-select.ll @@ -11,16 +11,16 @@ define i32 @test_select_i1_trunc(i32 %a, i32 %b, i32 %c, i32 %true, i32 %false) ; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: 
ld.param.u32 %r1, [test_select_i1_trunc_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_trunc_param_0]; ; CHECK-NEXT: and.b32 %r2, %r1, 1; ; CHECK-NEXT: setp.ne.b32 %p1, %r2, 0; -; CHECK-NEXT: ld.param.u32 %r3, [test_select_i1_trunc_param_1]; -; CHECK-NEXT: ld.param.u32 %r4, [test_select_i1_trunc_param_2]; -; CHECK-NEXT: ld.param.u32 %r5, [test_select_i1_trunc_param_3]; +; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_trunc_param_1]; +; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_trunc_param_2]; +; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_trunc_param_3]; ; CHECK-NEXT: selp.b32 %r6, %r3, %r4, %p1; ; CHECK-NEXT: and.b32 %r7, %r6, 1; ; CHECK-NEXT: setp.ne.b32 %p2, %r7, 0; -; CHECK-NEXT: ld.param.u32 %r8, [test_select_i1_trunc_param_4]; +; CHECK-NEXT: ld.param.b32 %r8, [test_select_i1_trunc_param_4]; ; CHECK-NEXT: selp.b32 %r9, %r5, %r8, %p2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r9; ; CHECK-NEXT: ret; @@ -41,16 +41,16 @@ define i32 @test_select_i1_trunc_2(i64 %a, i16 %b, i32 %c, i32 %true, i32 %false ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_select_i1_trunc_2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_select_i1_trunc_2_param_0]; ; CHECK-NEXT: and.b64 %rd2, %rd1, 1; ; CHECK-NEXT: setp.ne.b64 %p1, %rd2, 0; -; CHECK-NEXT: ld.param.u16 %rs1, [test_select_i1_trunc_2_param_1]; -; CHECK-NEXT: ld.param.u16 %rs2, [test_select_i1_trunc_2_param_2]; -; CHECK-NEXT: ld.param.u32 %r1, [test_select_i1_trunc_2_param_3]; +; CHECK-NEXT: ld.param.b16 %rs1, [test_select_i1_trunc_2_param_1]; +; CHECK-NEXT: ld.param.b16 %rs2, [test_select_i1_trunc_2_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_trunc_2_param_3]; ; CHECK-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; ; CHECK-NEXT: and.b16 %rs4, %rs3, 1; ; CHECK-NEXT: setp.ne.b16 %p2, %rs4, 0; -; CHECK-NEXT: ld.param.u32 %r2, [test_select_i1_trunc_2_param_4]; +; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_trunc_2_param_4]; ; CHECK-NEXT: 
selp.b32 %r3, %r1, %r2, %p2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -69,15 +69,15 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals ; CHECK-NEXT: .reg .b32 %r<12>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_select_i1_basic_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_select_i1_basic_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_basic_param_1]; ; CHECK-NEXT: or.b32 %r4, %r1, %r2; ; CHECK-NEXT: setp.ne.s32 %p1, %r1, 0; -; CHECK-NEXT: ld.param.u32 %r5, [test_select_i1_basic_param_2]; +; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_param_2]; ; CHECK-NEXT: setp.eq.s32 %p2, %r5, 0; -; CHECK-NEXT: ld.param.u32 %r7, [test_select_i1_basic_param_3]; +; CHECK-NEXT: ld.param.b32 %r7, [test_select_i1_basic_param_3]; ; CHECK-NEXT: setp.eq.s32 %p3, %r4, 0; -; CHECK-NEXT: ld.param.u32 %r8, [test_select_i1_basic_param_4]; +; CHECK-NEXT: ld.param.b32 %r8, [test_select_i1_basic_param_4]; ; CHECK-NEXT: selp.b32 %r9, %r7, %r8, %p2; ; CHECK-NEXT: selp.b32 %r10, %r9, %r8, %p1; ; CHECK-NEXT: selp.b32 %r11, %r7, %r10, %p3; @@ -98,16 +98,16 @@ define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_select_i1_basic_folding_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_folding_param_0]; ; CHECK-NEXT: setp.eq.s32 %p1, %r1, 0; -; CHECK-NEXT: ld.param.u32 %r2, [test_select_i1_basic_folding_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_basic_folding_param_1]; ; CHECK-NEXT: setp.ne.s32 %p2, %r2, 0; ; CHECK-NEXT: setp.eq.s32 %p3, %r2, 0; -; CHECK-NEXT: ld.param.u32 %r3, [test_select_i1_basic_folding_param_2]; +; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_2]; ; CHECK-NEXT: setp.eq.s32 %p4, %r3, 0; -; CHECK-NEXT: ld.param.u32 
%r4, [test_select_i1_basic_folding_param_3]; +; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_folding_param_3]; ; CHECK-NEXT: xor.pred %p6, %p1, %p3; -; CHECK-NEXT: ld.param.u32 %r5, [test_select_i1_basic_folding_param_4]; +; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_4]; ; CHECK-NEXT: and.pred %p7, %p6, %p4; ; CHECK-NEXT: and.pred %p9, %p2, %p4; ; CHECK-NEXT: and.pred %p10, %p3, %p7; diff --git a/llvm/test/CodeGen/NVPTX/i128-array.ll b/llvm/test/CodeGen/NVPTX/i128-array.ll index dd6d48bd5862..3bb9c6aec51a 100644 --- a/llvm/test/CodeGen/NVPTX/i128-array.ll +++ b/llvm/test/CodeGen/NVPTX/i128-array.ll @@ -8,8 +8,8 @@ define [2 x i128] @foo(i64 %a, i32 %b) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [foo_param_1]; -; CHECK-NEXT: ld.param.u64 %rd1, [foo_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [foo_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0]; ; CHECK-NEXT: shr.s64 %rd2, %rd1, 63; ; CHECK-NEXT: cvt.s64.s32 %rd3, %r1; ; CHECK-NEXT: shr.s64 %rd4, %rd3, 63; @@ -30,8 +30,8 @@ define [2 x i128] @foo2(ptr byval([2 x i128]) %a) { ; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [foo2_param_0]; -; CHECK-NEXT: ld.param.v2.u64 {%rd5, %rd6}, [foo2_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [foo2_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [foo2_param_0+16]; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd4}; ; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd5, %rd6}; ; CHECK-NEXT: ret; @@ -51,8 +51,8 @@ define [2 x i128] @foo3([2 x i128] %a) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [foo3_param_0+16]; -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [foo3_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [foo3_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [foo3_param_0]; ; CHECK-NEXT: 
st.param.v2.b64 [func_retval0], {%rd1, %rd2}; ; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4}; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/i128-ld-st.ll b/llvm/test/CodeGen/NVPTX/i128-ld-st.ll index 41cffe9cdbf9..6bf65d4d4ad6 100644 --- a/llvm/test/CodeGen/NVPTX/i128-ld-st.ll +++ b/llvm/test/CodeGen/NVPTX/i128-ld-st.ll @@ -10,11 +10,11 @@ define i128 @foo(ptr %p, ptr %o) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd2, [foo_param_1]; -; CHECK-NEXT: ld.param.u64 %rd1, [foo_param_0]; -; CHECK-NEXT: ld.u8 %rd3, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd2, [foo_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0]; +; CHECK-NEXT: ld.b8 %rd3, [%rd1]; ; CHECK-NEXT: mov.b64 %rd4, 0; -; CHECK-NEXT: st.v2.u64 [%rd2], {%rd3, %rd4}; +; CHECK-NEXT: st.v2.b64 [%rd2], {%rd3, %rd4}; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd4}; ; CHECK-NEXT: ret; %c = load i8, ptr %p, align 1 diff --git a/llvm/test/CodeGen/NVPTX/i128-param.ll b/llvm/test/CodeGen/NVPTX/i128-param.ll index 849d96f2cf3b..4f4c2fe73ba7 100644 --- a/llvm/test/CodeGen/NVPTX/i128-param.ll +++ b/llvm/test/CodeGen/NVPTX/i128-param.ll @@ -5,8 +5,8 @@ ; CHECK-NEXT: .param .align 16 .b8 callee_param_0[16], ; CHECK-NEXT: .param .align 16 .b8 callee_param_1[16], define void @callee(i128, i128, ptr) { - ; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0]; - ; CHECK-DAG: ld.param.v2.u64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [callee_param_1]; + ; CHECK-DAG: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0]; + ; CHECK-DAG: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [callee_param_1]; ; CHECK: mul.lo.s64 %[[REG4:rd[0-9]+]], %[[REG0]], %[[REG3]]; ; CHECK-NEXT: mul.hi.u64 %[[REG5:rd[0-9]+]], %[[REG0]], %[[REG2]]; @@ -25,8 +25,8 @@ define void @callee(i128, i128, ptr) { ; CHECK-NEXT: .param .align 16 .b8 caller_kernel_param_1[16], define ptx_kernel void 
@caller_kernel(i128, i128, ptr) { start: - ; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_kernel_param_0]; - ; CHECK-DAG: ld.param.v2.u64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_kernel_param_1]; + ; CHECK-DAG: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_kernel_param_0]; + ; CHECK-DAG: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_kernel_param_1]; ; CHECK: { // callseq [[CALLSEQ_ID:[0-9]]], 0 ; CHECK: .param .align 16 .b8 param0[16]; @@ -44,8 +44,8 @@ start: ; CHECK-NEXT: .param .align 16 .b8 caller_func_param_1[16], define void @caller_func(i128, i128, ptr) { start: - ; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_func_param_0] - ; CHECK-DAG: ld.param.v2.u64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_func_param_1] + ; CHECK-DAG: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_func_param_0] + ; CHECK-DAG: ld.param.v2.b64 {%[[REG2:rd[0-9]+]], %[[REG3:rd[0-9]+]]}, [caller_func_param_1] ; CHECK: { // callseq [[CALLSEQ_ID:[0-9]]], 0 ; CHECK: .param .align 16 .b8 param0[16]; diff --git a/llvm/test/CodeGen/NVPTX/i128-retval.ll b/llvm/test/CodeGen/NVPTX/i128-retval.ll index a01d14d5ca77..7fea1c43aad2 100644 --- a/llvm/test/CodeGen/NVPTX/i128-retval.ll +++ b/llvm/test/CodeGen/NVPTX/i128-retval.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: .visible .func (.param .align 16 .b8 func_retval0[16]) callee( define i128 @callee(i128) { - ; CHECK: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0]; + ; CHECK: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [callee_param_0]; ; CHECK: st.param.v2.b64 [func_retval0], {%[[REG0]], %[[REG1]]} ret i128 %0 } @@ -11,8 +11,8 @@ define i128 @callee(i128) { ; CHECK-LABEL: .visible .func caller( define void @caller(i128, ptr) { start: - ; CHECK-DAG: ld.param.v2.u64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_param_0]; - ; CHECK-DAG: ld.param.u64 %[[OUT:rd[0-9]+]], 
[caller_param_1]; + ; CHECK-DAG: ld.param.v2.b64 {%[[REG0:rd[0-9]+]], %[[REG1:rd[0-9]+]]}, [caller_param_0]; + ; CHECK-DAG: ld.param.b64 %[[OUT:rd[0-9]+]], [caller_param_1]; ; CHECK: { // callseq 0, 0 ; CHECK: .param .align 16 .b8 retval0[16]; @@ -21,7 +21,7 @@ start: ; CHECK: } // callseq 0 %a = call i128 @callee(i128 %0) - ; CHECK-DAG: st.v2.u64 [%[[OUT]]], {%[[REG2]], %[[REG3]]}; + ; CHECK-DAG: st.v2.b64 [%[[OUT]]], {%[[REG2]], %[[REG3]]}; store i128 %a, ptr %1 ret void diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index f1ca19b30ac2..ecd42fd6ceb3 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -10,8 +10,8 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: .reg .b64 %rd<127>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases -; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [srem_i128_param_0]; -; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [srem_i128_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [srem_i128_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd49, %rd50}, [srem_i128_param_1]; ; CHECK-NEXT: shr.s64 %rd2, %rd46, 63; ; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45; ; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46; @@ -151,8 +151,8 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: .reg .b64 %rd<113>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases -; CHECK-NEXT: ld.param.v2.u64 {%rd41, %rd42}, [urem_i128_param_0]; -; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [urem_i128_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [urem_i128_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [urem_i128_param_1]; ; CHECK-NEXT: or.b64 %rd45, %rd3, %rd4; ; CHECK-NEXT: setp.eq.s64 %p1, %rd45, 0; ; CHECK-NEXT: or.b64 %rd46, %rd41, %rd42; @@ -275,7 +275,7 @@ define i128 @srem_i128_pow2k(i128 %lhs) { ; CHECK-NEXT: .reg .b64 %rd<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [srem_i128_pow2k_param_0]; +; 
CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [srem_i128_pow2k_param_0]; ; CHECK-NEXT: shr.s64 %rd3, %rd2, 63; ; CHECK-NEXT: shr.u64 %rd4, %rd3, 31; ; CHECK-NEXT: add.cc.s64 %rd5, %rd1, %rd4; @@ -295,7 +295,7 @@ define i128 @urem_i128_pow2k(i128 %lhs) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [urem_i128_pow2k_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [urem_i128_pow2k_param_0]; ; CHECK-NEXT: and.b64 %rd3, %rd1, 8589934591; ; CHECK-NEXT: mov.b64 %rd4, 0; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd3, %rd4}; @@ -312,8 +312,8 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: .reg .b64 %rd<122>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases -; CHECK-NEXT: ld.param.v2.u64 {%rd45, %rd46}, [sdiv_i128_param_0]; -; CHECK-NEXT: ld.param.v2.u64 {%rd49, %rd50}, [sdiv_i128_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [sdiv_i128_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd49, %rd50}, [sdiv_i128_param_1]; ; CHECK-NEXT: sub.cc.s64 %rd51, 0, %rd45; ; CHECK-NEXT: subc.cc.s64 %rd52, 0, %rd46; ; CHECK-NEXT: setp.lt.s64 %p1, %rd46, 0; @@ -448,8 +448,8 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: .reg .b64 %rd<107>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases -; CHECK-NEXT: ld.param.v2.u64 {%rd41, %rd42}, [udiv_i128_param_0]; -; CHECK-NEXT: ld.param.v2.u64 {%rd43, %rd44}, [udiv_i128_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [udiv_i128_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd43, %rd44}, [udiv_i128_param_1]; ; CHECK-NEXT: or.b64 %rd45, %rd43, %rd44; ; CHECK-NEXT: setp.eq.s64 %p1, %rd45, 0; ; CHECK-NEXT: or.b64 %rd46, %rd41, %rd42; @@ -566,7 +566,7 @@ define i128 @sdiv_i128_pow2k(i128 %lhs) { ; CHECK-NEXT: .reg .b64 %rd<11>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [sdiv_i128_pow2k_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, 
[sdiv_i128_pow2k_param_0]; ; CHECK-NEXT: shr.s64 %rd3, %rd2, 63; ; CHECK-NEXT: shr.u64 %rd4, %rd3, 31; ; CHECK-NEXT: add.cc.s64 %rd5, %rd1, %rd4; @@ -587,7 +587,7 @@ define i128 @udiv_i128_pow2k(i128 %lhs) { ; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [udiv_i128_pow2k_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [udiv_i128_pow2k_param_0]; ; CHECK-NEXT: shl.b64 %rd3, %rd2, 31; ; CHECK-NEXT: shr.u64 %rd4, %rd1, 33; ; CHECK-NEXT: or.b64 %rd5, %rd4, %rd3; @@ -604,8 +604,8 @@ define i128 @add_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [add_i128_param_0]; -; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [add_i128_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [add_i128_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [add_i128_param_1]; ; CHECK-NEXT: add.cc.s64 %rd5, %rd1, %rd3; ; CHECK-NEXT: addc.cc.s64 %rd6, %rd2, %rd4; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd5, %rd6}; diff --git a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll index 1fd044cd2efc..5bfa5b2bc63a 100644 --- a/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i16x2-instructions.ll @@ -39,7 +39,7 @@ define i16 @test_extract_0(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r1, [test_extract_0_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; ; I16x2-NEXT: mov.b32 {%rs1, _}, %r1; ; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r1; } ; COMMON-NEXT: cvt.u32.u16 %r2, %rs1; @@ -56,7 +56,7 @@ define i16 @test_extract_1(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r1, [test_extract_1_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_extract_1_param_0]; ; I16x2-NEXT: 
mov.b32 {_, %rs1}, %r1; ; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {tmp, %rs1}, %r1; } ; COMMON-NEXT: cvt.u32.u16 %r2, %rs1; @@ -75,8 +75,8 @@ define i16 @test_extract_i(<2 x i16> %a, i64 %idx) #0 { ; COMMON-NEXT: .reg .b64 %rd<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1]; -; COMMON-NEXT: ld.param.u32 %r1, [test_extract_i_param_0]; +; COMMON-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; ; COMMON-NEXT: setp.eq.s64 %p1, %rd1, 0; ; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: selp.b16 %rs3, %rs1, %rs2, %p1; @@ -93,8 +93,8 @@ define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 { ; I16x2-NEXT: .reg .b32 %r<4>; ; I16x2-EMPTY: ; I16x2-NEXT: // %bb.0: -; I16x2-NEXT: ld.param.u32 %r2, [test_add_param_1]; -; I16x2-NEXT: ld.param.u32 %r1, [test_add_param_0]; +; I16x2-NEXT: ld.param.b32 %r2, [test_add_param_1]; +; I16x2-NEXT: ld.param.b32 %r1, [test_add_param_0]; ; I16x2-NEXT: add.s16x2 %r3, %r1, %r2; ; I16x2-NEXT: st.param.b32 [func_retval0], %r3; ; I16x2-NEXT: ret; @@ -105,8 +105,8 @@ define <2 x i16> @test_add(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<4>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.u32 %r2, [test_add_param_1]; -; NO-I16x2-NEXT: ld.param.u32 %r1, [test_add_param_0]; +; NO-I16x2-NEXT: ld.param.b32 %r2, [test_add_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_param_0]; ; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; ; NO-I16x2-NEXT: add.s16 %rs5, %rs4, %rs2; @@ -125,7 +125,7 @@ define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 { ; I16x2-NEXT: .reg .b32 %r<4>; ; I16x2-EMPTY: ; I16x2-NEXT: // %bb.0: -; I16x2-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0]; +; I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_0_param_0]; ; I16x2-NEXT: mov.b32 %r2, 131073; ; I16x2-NEXT: add.s16x2 %r3, %r1, %r2; ; I16x2-NEXT: st.param.b32 
[func_retval0], %r3; @@ -137,7 +137,7 @@ define <2 x i16> @test_add_imm_0(<2 x i16> %a) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_0_param_0]; ; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; NO-I16x2-NEXT: add.s16 %rs3, %rs2, 2; ; NO-I16x2-NEXT: add.s16 %rs4, %rs1, 1; @@ -154,7 +154,7 @@ define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 { ; I16x2-NEXT: .reg .b32 %r<4>; ; I16x2-EMPTY: ; I16x2-NEXT: // %bb.0: -; I16x2-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0]; +; I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_1_param_0]; ; I16x2-NEXT: mov.b32 %r2, 131073; ; I16x2-NEXT: add.s16x2 %r3, %r1, %r2; ; I16x2-NEXT: st.param.b32 [func_retval0], %r3; @@ -166,7 +166,7 @@ define <2 x i16> @test_add_imm_1(<2 x i16> %a) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<3>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_add_imm_1_param_0]; ; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; NO-I16x2-NEXT: add.s16 %rs3, %rs2, 2; ; NO-I16x2-NEXT: add.s16 %rs4, %rs1, 1; @@ -184,8 +184,8 @@ define <2 x i16> @test_sub(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r2, [test_sub_param_1]; -; COMMON-NEXT: ld.param.u32 %r1, [test_sub_param_0]; +; COMMON-NEXT: ld.param.b32 %r2, [test_sub_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_sub_param_0]; ; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; ; COMMON-NEXT: sub.s16 %rs5, %rs4, %rs2; @@ -203,8 +203,8 @@ define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 { ; I16x2-NEXT: .reg .b32 %r<4>; ; I16x2-EMPTY: ; I16x2-NEXT: // %bb.0: -; I16x2-NEXT: ld.param.u32 %r2, [test_smax_param_1]; -; I16x2-NEXT: ld.param.u32 %r1, [test_smax_param_0]; +; I16x2-NEXT: ld.param.b32 %r2, 
[test_smax_param_1]; +; I16x2-NEXT: ld.param.b32 %r1, [test_smax_param_0]; ; I16x2-NEXT: max.s16x2 %r3, %r1, %r2; ; I16x2-NEXT: st.param.b32 [func_retval0], %r3; ; I16x2-NEXT: ret; @@ -215,8 +215,8 @@ define <2 x i16> @test_smax(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<4>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.u32 %r2, [test_smax_param_1]; -; NO-I16x2-NEXT: ld.param.u32 %r1, [test_smax_param_0]; +; NO-I16x2-NEXT: ld.param.b32 %r2, [test_smax_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_smax_param_0]; ; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; ; NO-I16x2-NEXT: max.s16 %rs5, %rs4, %rs2; @@ -235,8 +235,8 @@ define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 { ; I16x2-NEXT: .reg .b32 %r<4>; ; I16x2-EMPTY: ; I16x2-NEXT: // %bb.0: -; I16x2-NEXT: ld.param.u32 %r2, [test_umax_param_1]; -; I16x2-NEXT: ld.param.u32 %r1, [test_umax_param_0]; +; I16x2-NEXT: ld.param.b32 %r2, [test_umax_param_1]; +; I16x2-NEXT: ld.param.b32 %r1, [test_umax_param_0]; ; I16x2-NEXT: max.u16x2 %r3, %r1, %r2; ; I16x2-NEXT: st.param.b32 [func_retval0], %r3; ; I16x2-NEXT: ret; @@ -247,8 +247,8 @@ define <2 x i16> @test_umax(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<4>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.u32 %r2, [test_umax_param_1]; -; NO-I16x2-NEXT: ld.param.u32 %r1, [test_umax_param_0]; +; NO-I16x2-NEXT: ld.param.b32 %r2, [test_umax_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_umax_param_0]; ; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; ; NO-I16x2-NEXT: max.u16 %rs5, %rs4, %rs2; @@ -267,8 +267,8 @@ define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 { ; I16x2-NEXT: .reg .b32 %r<4>; ; I16x2-EMPTY: ; I16x2-NEXT: // %bb.0: -; I16x2-NEXT: ld.param.u32 %r2, [test_smin_param_1]; -; I16x2-NEXT: ld.param.u32 %r1, [test_smin_param_0]; +; I16x2-NEXT: ld.param.b32 %r2, 
[test_smin_param_1]; +; I16x2-NEXT: ld.param.b32 %r1, [test_smin_param_0]; ; I16x2-NEXT: min.s16x2 %r3, %r1, %r2; ; I16x2-NEXT: st.param.b32 [func_retval0], %r3; ; I16x2-NEXT: ret; @@ -279,8 +279,8 @@ define <2 x i16> @test_smin(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<4>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.u32 %r2, [test_smin_param_1]; -; NO-I16x2-NEXT: ld.param.u32 %r1, [test_smin_param_0]; +; NO-I16x2-NEXT: ld.param.b32 %r2, [test_smin_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_smin_param_0]; ; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; ; NO-I16x2-NEXT: min.s16 %rs5, %rs4, %rs2; @@ -299,8 +299,8 @@ define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 { ; I16x2-NEXT: .reg .b32 %r<4>; ; I16x2-EMPTY: ; I16x2-NEXT: // %bb.0: -; I16x2-NEXT: ld.param.u32 %r2, [test_umin_param_1]; -; I16x2-NEXT: ld.param.u32 %r1, [test_umin_param_0]; +; I16x2-NEXT: ld.param.b32 %r2, [test_umin_param_1]; +; I16x2-NEXT: ld.param.b32 %r1, [test_umin_param_0]; ; I16x2-NEXT: min.u16x2 %r3, %r1, %r2; ; I16x2-NEXT: st.param.b32 [func_retval0], %r3; ; I16x2-NEXT: ret; @@ -311,8 +311,8 @@ define <2 x i16> @test_umin(<2 x i16> %a, <2 x i16> %b) #0 { ; NO-I16x2-NEXT: .reg .b32 %r<4>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.u32 %r2, [test_umin_param_1]; -; NO-I16x2-NEXT: ld.param.u32 %r1, [test_umin_param_0]; +; NO-I16x2-NEXT: ld.param.b32 %r2, [test_umin_param_1]; +; NO-I16x2-NEXT: ld.param.b32 %r1, [test_umin_param_0]; ; NO-I16x2-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; NO-I16x2-NEXT: mov.b32 {%rs3, %rs4}, %r1; ; NO-I16x2-NEXT: min.u16 %rs5, %rs4, %rs2; @@ -332,8 +332,8 @@ define <2 x i16> @test_mul(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r2, [test_mul_param_1]; -; COMMON-NEXT: ld.param.u32 %r1, [test_mul_param_0]; +; COMMON-NEXT: ld.param.b32 %r2, 
[test_mul_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_mul_param_0]; ; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r1; ; COMMON-NEXT: mul.lo.s16 %rs5, %rs4, %rs2; @@ -352,8 +352,8 @@ define <2 x i16> @test_or(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r2, [test_or_param_1]; -; COMMON-NEXT: ld.param.u32 %r1, [test_or_param_0]; +; COMMON-NEXT: ld.param.b32 %r2, [test_or_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_or_param_0]; ; COMMON-NEXT: or.b32 %r3, %r1, %r2; ; COMMON-NEXT: st.param.b32 [func_retval0], %r3; ; COMMON-NEXT: ret; @@ -370,7 +370,7 @@ define <2 x i16> @test_or_computed(i16 %a) { ; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u16 %rs1, [test_or_computed_param_0]; +; COMMON-NEXT: ld.param.b16 %rs1, [test_or_computed_param_0]; ; COMMON-NEXT: mov.b16 %rs2, 0; ; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2}; ; COMMON-NEXT: mov.b16 %rs3, 5; @@ -391,7 +391,7 @@ define <2 x i16> @test_or_imm_0(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r1, [test_or_imm_0_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_or_imm_0_param_0]; ; COMMON-NEXT: or.b32 %r2, %r1, 131073; ; COMMON-NEXT: st.param.b32 [func_retval0], %r2; ; COMMON-NEXT: ret; @@ -405,7 +405,7 @@ define <2 x i16> @test_or_imm_1(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r1, [test_or_imm_1_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_or_imm_1_param_0]; ; COMMON-NEXT: or.b32 %r2, %r1, 131073; ; COMMON-NEXT: st.param.b32 [func_retval0], %r2; ; COMMON-NEXT: ret; @@ -419,8 +419,8 @@ define <2 x i16> @test_xor(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r2, [test_xor_param_1]; -; 
COMMON-NEXT: ld.param.u32 %r1, [test_xor_param_0]; +; COMMON-NEXT: ld.param.b32 %r2, [test_xor_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_xor_param_0]; ; COMMON-NEXT: xor.b32 %r3, %r1, %r2; ; COMMON-NEXT: st.param.b32 [func_retval0], %r3; ; COMMON-NEXT: ret; @@ -435,7 +435,7 @@ define <2 x i16> @test_xor_computed(i16 %a) { ; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u16 %rs1, [test_xor_computed_param_0]; +; COMMON-NEXT: ld.param.b16 %rs1, [test_xor_computed_param_0]; ; COMMON-NEXT: mov.b16 %rs2, 0; ; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2}; ; COMMON-NEXT: mov.b16 %rs3, 5; @@ -456,7 +456,7 @@ define <2 x i16> @test_xor_imm_0(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r1, [test_xor_imm_0_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_xor_imm_0_param_0]; ; COMMON-NEXT: xor.b32 %r2, %r1, 131073; ; COMMON-NEXT: st.param.b32 [func_retval0], %r2; ; COMMON-NEXT: ret; @@ -470,7 +470,7 @@ define <2 x i16> @test_xor_imm_1(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r1, [test_xor_imm_1_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_xor_imm_1_param_0]; ; COMMON-NEXT: xor.b32 %r2, %r1, 131073; ; COMMON-NEXT: st.param.b32 [func_retval0], %r2; ; COMMON-NEXT: ret; @@ -484,8 +484,8 @@ define <2 x i16> @test_and(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r2, [test_and_param_1]; -; COMMON-NEXT: ld.param.u32 %r1, [test_and_param_0]; +; COMMON-NEXT: ld.param.b32 %r2, [test_and_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_and_param_0]; ; COMMON-NEXT: and.b32 %r3, %r1, %r2; ; COMMON-NEXT: st.param.b32 [func_retval0], %r3; ; COMMON-NEXT: ret; @@ -502,7 +502,7 @@ define <2 x i16> @test_and_computed(i16 %a) { ; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // 
%bb.0: -; COMMON-NEXT: ld.param.u16 %rs1, [test_and_computed_param_0]; +; COMMON-NEXT: ld.param.b16 %rs1, [test_and_computed_param_0]; ; COMMON-NEXT: mov.b16 %rs2, 0; ; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2}; ; COMMON-NEXT: mov.b16 %rs3, 5; @@ -523,7 +523,7 @@ define <2 x i16> @test_and_imm_0(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r1, [test_and_imm_0_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_and_imm_0_param_0]; ; COMMON-NEXT: and.b32 %r2, %r1, 131073; ; COMMON-NEXT: st.param.b32 [func_retval0], %r2; ; COMMON-NEXT: ret; @@ -537,7 +537,7 @@ define <2 x i16> @test_and_imm_1(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r1, [test_and_imm_1_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_and_imm_1_param_0]; ; COMMON-NEXT: and.b32 %r2, %r1, 131073; ; COMMON-NEXT: st.param.b32 [func_retval0], %r2; ; COMMON-NEXT: ret; @@ -552,10 +552,10 @@ define void @test_ldst_v2i16(ptr %a, ptr %b) { ; COMMON-NEXT: .reg .b64 %rd<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u64 %rd2, [test_ldst_v2i16_param_1]; -; COMMON-NEXT: ld.param.u64 %rd1, [test_ldst_v2i16_param_0]; -; COMMON-NEXT: ld.u32 %r1, [%rd1]; -; COMMON-NEXT: st.u32 [%rd2], %r1; +; COMMON-NEXT: ld.param.b64 %rd2, [test_ldst_v2i16_param_1]; +; COMMON-NEXT: ld.param.b64 %rd1, [test_ldst_v2i16_param_0]; +; COMMON-NEXT: ld.b32 %r1, [%rd1]; +; COMMON-NEXT: st.b32 [%rd2], %r1; ; COMMON-NEXT: ret; %t1 = load <2 x i16>, ptr %a store <2 x i16> %t1, ptr %b, align 16 @@ -572,12 +572,12 @@ define void @test_ldst_v3i16(ptr %a, ptr %b) { ; COMMON-NEXT: .reg .b64 %rd<5>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u64 %rd2, [test_ldst_v3i16_param_1]; -; COMMON-NEXT: ld.param.u64 %rd1, [test_ldst_v3i16_param_0]; -; COMMON-NEXT: ld.u64 %rd3, [%rd1]; +; COMMON-NEXT: ld.param.b64 %rd2, [test_ldst_v3i16_param_1]; +; COMMON-NEXT: 
ld.param.b64 %rd1, [test_ldst_v3i16_param_0]; +; COMMON-NEXT: ld.b64 %rd3, [%rd1]; ; COMMON-NEXT: shr.u64 %rd4, %rd3, 32; -; COMMON-NEXT: st.u32 [%rd2], %rd3; -; COMMON-NEXT: st.u16 [%rd2+4], %rd4; +; COMMON-NEXT: st.b32 [%rd2], %rd3; +; COMMON-NEXT: st.b16 [%rd2+4], %rd4; ; COMMON-NEXT: ret; %t1 = load <3 x i16>, ptr %a store <3 x i16> %t1, ptr %b, align 16 @@ -591,10 +591,10 @@ define void @test_ldst_v4i16(ptr %a, ptr %b) { ; COMMON-NEXT: .reg .b64 %rd<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u64 %rd2, [test_ldst_v4i16_param_1]; -; COMMON-NEXT: ld.param.u64 %rd1, [test_ldst_v4i16_param_0]; -; COMMON-NEXT: ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; -; COMMON-NEXT: st.v4.u16 [%rd2], {%rs1, %rs2, %rs3, %rs4}; +; COMMON-NEXT: ld.param.b64 %rd2, [test_ldst_v4i16_param_1]; +; COMMON-NEXT: ld.param.b64 %rd1, [test_ldst_v4i16_param_0]; +; COMMON-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; COMMON-NEXT: st.v4.b16 [%rd2], {%rs1, %rs2, %rs3, %rs4}; ; COMMON-NEXT: ret; %t1 = load <4 x i16>, ptr %a store <4 x i16> %t1, ptr %b, align 16 @@ -608,8 +608,8 @@ define void @test_ldst_v8i16(ptr %a, ptr %b) { ; COMMON-NEXT: .reg .b64 %rd<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u64 %rd2, [test_ldst_v8i16_param_1]; -; COMMON-NEXT: ld.param.u64 %rd1, [test_ldst_v8i16_param_0]; +; COMMON-NEXT: ld.param.b64 %rd2, [test_ldst_v8i16_param_1]; +; COMMON-NEXT: ld.param.b64 %rd1, [test_ldst_v8i16_param_0]; ; COMMON-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; COMMON-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; COMMON-NEXT: ret; @@ -626,8 +626,8 @@ define <2 x i16> @test_call(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<5>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r2, [test_call_param_1]; -; COMMON-NEXT: ld.param.u32 %r1, [test_call_param_0]; +; COMMON-NEXT: ld.param.b32 %r2, [test_call_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_call_param_0]; ; COMMON-NEXT: { // callseq 
0, 0 ; COMMON-NEXT: .param .align 4 .b8 param0[4]; ; COMMON-NEXT: st.param.b32 [param0], %r1; @@ -654,8 +654,8 @@ define <2 x i16> @test_call_flipped(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<5>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r2, [test_call_flipped_param_1]; -; COMMON-NEXT: ld.param.u32 %r1, [test_call_flipped_param_0]; +; COMMON-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0]; ; COMMON-NEXT: { // callseq 1, 0 ; COMMON-NEXT: .param .align 4 .b8 param0[4]; ; COMMON-NEXT: st.param.b32 [param0], %r2; @@ -682,8 +682,8 @@ define <2 x i16> @test_tailcall_flipped(<2 x i16> %a, <2 x i16> %b) #0 { ; COMMON-NEXT: .reg .b32 %r<5>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r2, [test_tailcall_flipped_param_1]; -; COMMON-NEXT: ld.param.u32 %r1, [test_tailcall_flipped_param_0]; +; COMMON-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0]; ; COMMON-NEXT: { // callseq 2, 0 ; COMMON-NEXT: .param .align 4 .b8 param0[4]; ; COMMON-NEXT: st.param.b32 [param0], %r2; @@ -712,11 +712,11 @@ define <2 x i16> @test_select(<2 x i16> %a, <2 x i16> %b, i1 zeroext %c) #0 { ; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u8 %rs1, [test_select_param_2]; +; COMMON-NEXT: ld.param.b8 %rs1, [test_select_param_2]; ; COMMON-NEXT: and.b16 %rs2, %rs1, 1; ; COMMON-NEXT: setp.ne.b16 %p1, %rs2, 0; -; COMMON-NEXT: ld.param.u32 %r2, [test_select_param_1]; -; COMMON-NEXT: ld.param.u32 %r1, [test_select_param_0]; +; COMMON-NEXT: ld.param.b32 %r2, [test_select_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_select_param_0]; ; COMMON-NEXT: selp.b32 %r3, %r1, %r2, %p1; ; COMMON-NEXT: st.param.b32 [func_retval0], %r3; ; COMMON-NEXT: ret; @@ -732,10 +732,10 @@ define <2 x i16> @test_select_cc(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x ; 
COMMON-NEXT: .reg .b32 %r<6>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r4, [test_select_cc_param_3]; -; COMMON-NEXT: ld.param.u32 %r3, [test_select_cc_param_2]; -; COMMON-NEXT: ld.param.u32 %r2, [test_select_cc_param_1]; -; COMMON-NEXT: ld.param.u32 %r1, [test_select_cc_param_0]; +; COMMON-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; +; COMMON-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; +; COMMON-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; ; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r3; ; COMMON-NEXT: setp.ne.s16 %p1, %rs3, %rs1; @@ -760,10 +760,10 @@ define <2 x i32> @test_select_cc_i32_i16(<2 x i32> %a, <2 x i32> %b, ; COMMON-NEXT: .reg .b32 %r<9>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_select_cc_i32_i16_param_1]; -; COMMON-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_select_cc_i32_i16_param_0]; -; COMMON-NEXT: ld.param.u32 %r6, [test_select_cc_i32_i16_param_3]; -; COMMON-NEXT: ld.param.u32 %r5, [test_select_cc_i32_i16_param_2]; +; COMMON-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i32_i16_param_1]; +; COMMON-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_select_cc_i32_i16_param_0]; +; COMMON-NEXT: ld.param.b32 %r6, [test_select_cc_i32_i16_param_3]; +; COMMON-NEXT: ld.param.b32 %r5, [test_select_cc_i32_i16_param_2]; ; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r6; ; COMMON-NEXT: mov.b32 {%rs3, %rs4}, %r5; ; COMMON-NEXT: setp.ne.s16 %p1, %rs3, %rs1; @@ -786,10 +786,10 @@ define <2 x i16> @test_select_cc_i16_i32(<2 x i16> %a, <2 x i16> %b, ; COMMON-NEXT: .reg .b32 %r<8>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.u32 {%r5, %r6}, [test_select_cc_i16_i32_param_3]; -; COMMON-NEXT: ld.param.v2.u32 {%r3, %r4}, [test_select_cc_i16_i32_param_2]; -; COMMON-NEXT: ld.param.u32 %r2, [test_select_cc_i16_i32_param_1]; -; COMMON-NEXT: ld.param.u32 %r1, 
[test_select_cc_i16_i32_param_0]; +; COMMON-NEXT: ld.param.v2.b32 {%r5, %r6}, [test_select_cc_i16_i32_param_3]; +; COMMON-NEXT: ld.param.v2.b32 {%r3, %r4}, [test_select_cc_i16_i32_param_2]; +; COMMON-NEXT: ld.param.b32 %r2, [test_select_cc_i16_i32_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_select_cc_i16_i32_param_0]; ; COMMON-NEXT: setp.ne.s32 %p1, %r3, %r5; ; COMMON-NEXT: setp.ne.s32 %p2, %r4, %r6; ; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r2; @@ -812,7 +812,7 @@ define <2 x i16> @test_trunc_2xi32(<2 x i32> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_param_0]; +; COMMON-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_param_0]; ; COMMON-NEXT: prmt.b32 %r3, %r1, %r2, 0x5410U; ; COMMON-NEXT: st.param.b32 [func_retval0], %r3; ; COMMON-NEXT: ret; @@ -827,12 +827,12 @@ define <2 x i16> @test_trunc_2xi32_muliple_use0(<2 x i32> %a, ptr %p) #0 { ; I16x2-NEXT: .reg .b64 %rd<2>; ; I16x2-EMPTY: ; I16x2-NEXT: // %bb.0: -; I16x2-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0]; -; I16x2-NEXT: ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use0_param_1]; +; I16x2-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0]; +; I16x2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_muliple_use0_param_1]; ; I16x2-NEXT: prmt.b32 %r3, %r1, %r2, 0x5410U; ; I16x2-NEXT: mov.b32 %r4, 65537; ; I16x2-NEXT: add.s16x2 %r5, %r3, %r4; -; I16x2-NEXT: st.u32 [%rd1], %r5; +; I16x2-NEXT: st.b32 [%rd1], %r5; ; I16x2-NEXT: st.param.b32 [func_retval0], %r3; ; I16x2-NEXT: ret; ; @@ -843,15 +843,15 @@ define <2 x i16> @test_trunc_2xi32_muliple_use0(<2 x i32> %a, ptr %p) #0 { ; NO-I16x2-NEXT: .reg .b64 %rd<2>; ; NO-I16x2-EMPTY: ; NO-I16x2-NEXT: // %bb.0: -; NO-I16x2-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use0_param_0]; -; NO-I16x2-NEXT: ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use0_param_1]; +; NO-I16x2-NEXT: ld.param.v2.b32 {%r1, %r2}, 
[test_trunc_2xi32_muliple_use0_param_0]; +; NO-I16x2-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_muliple_use0_param_1]; ; NO-I16x2-NEXT: cvt.u16.u32 %rs1, %r2; ; NO-I16x2-NEXT: cvt.u16.u32 %rs2, %r1; ; NO-I16x2-NEXT: mov.b32 %r3, {%rs2, %rs1}; ; NO-I16x2-NEXT: add.s16 %rs3, %rs1, 1; ; NO-I16x2-NEXT: add.s16 %rs4, %rs2, 1; ; NO-I16x2-NEXT: mov.b32 %r4, {%rs4, %rs3}; -; NO-I16x2-NEXT: st.u32 [%rd1], %r4; +; NO-I16x2-NEXT: st.b32 [%rd1], %r4; ; NO-I16x2-NEXT: st.param.b32 [func_retval0], %r3; ; NO-I16x2-NEXT: ret; %r = trunc <2 x i32> %a to <2 x i16> @@ -869,12 +869,12 @@ define <2 x i16> @test_trunc_2xi32_muliple_use1(<2 x i32> %a, ptr %p) #0 { ; COMMON-NEXT: .reg .b64 %rd<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_trunc_2xi32_muliple_use1_param_0]; -; COMMON-NEXT: ld.param.u64 %rd1, [test_trunc_2xi32_muliple_use1_param_1]; +; COMMON-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_trunc_2xi32_muliple_use1_param_0]; +; COMMON-NEXT: ld.param.b64 %rd1, [test_trunc_2xi32_muliple_use1_param_1]; ; COMMON-NEXT: prmt.b32 %r3, %r1, %r2, 0x5410U; ; COMMON-NEXT: add.s32 %r4, %r2, 1; ; COMMON-NEXT: add.s32 %r5, %r1, 1; -; COMMON-NEXT: st.v2.u32 [%rd1], {%r5, %r4}; +; COMMON-NEXT: st.v2.b32 [%rd1], {%r5, %r4}; ; COMMON-NEXT: st.param.b32 [func_retval0], %r3; ; COMMON-NEXT: ret; %r = trunc <2 x i32> %a to <2 x i16> @@ -893,7 +893,7 @@ define <2 x i16> @test_trunc_2xi64(<2 x i64> %a) #0 { ; COMMON-NEXT: .reg .b64 %rd<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; +; COMMON-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; ; COMMON-NEXT: cvt.u16.u64 %rs1, %rd2; ; COMMON-NEXT: cvt.u16.u64 %rs2, %rd1; ; COMMON-NEXT: mov.b32 %r1, {%rs2, %rs1}; @@ -910,7 +910,7 @@ define <2 x i32> @test_zext_2xi32(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<4>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r1, [test_zext_2xi32_param_0]; +; 
COMMON-NEXT: ld.param.b32 %r1, [test_zext_2xi32_param_0]; ; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: cvt.u32.u16 %r2, %rs1; ; COMMON-NEXT: cvt.u32.u16 %r3, %rs2; @@ -928,7 +928,7 @@ define <2 x i64> @test_zext_2xi64(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b64 %rd<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r1, [test_zext_2xi64_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_zext_2xi64_param_0]; ; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: cvt.u64.u16 %rd1, %rs2; ; COMMON-NEXT: cvt.u64.u16 %rd2, %rs1; @@ -944,7 +944,7 @@ define <2 x i16> @test_bitcast_i32_to_2xi16(i32 %a) #0 { ; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r1, [test_bitcast_i32_to_2xi16_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_bitcast_i32_to_2xi16_param_0]; ; COMMON-NEXT: st.param.b32 [func_retval0], %r1; ; COMMON-NEXT: ret; %r = bitcast i32 %a to <2 x i16> @@ -957,7 +957,7 @@ define i32 @test_bitcast_2xi16_to_i32(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r1, [test_bitcast_2xi16_to_i32_param_0]; +; COMMON-NEXT: ld.param.b32 %r1, [test_bitcast_2xi16_to_i32_param_0]; ; COMMON-NEXT: st.param.b32 [func_retval0], %r1; ; COMMON-NEXT: ret; %r = bitcast <2 x i16> %a to i32 @@ -971,7 +971,7 @@ define <2 x half> @test_bitcast_2xi16_to_2xhalf(i16 %a) #0 { ; COMMON-NEXT: .reg .b32 %r<2>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u16 %rs1, [test_bitcast_2xi16_to_2xhalf_param_0]; +; COMMON-NEXT: ld.param.b16 %rs1, [test_bitcast_2xi16_to_2xhalf_param_0]; ; COMMON-NEXT: mov.b16 %rs2, 5; ; COMMON-NEXT: mov.b32 %r1, {%rs1, %rs2}; ; COMMON-NEXT: st.param.b32 [func_retval0], %r1; @@ -990,7 +990,7 @@ define <2 x i16> @test_shufflevector(<2 x i16> %a) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u32 %r1, [test_shufflevector_param_0]; +; 
COMMON-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; ; COMMON-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; COMMON-NEXT: mov.b32 %r2, {%rs2, %rs1}; ; COMMON-NEXT: st.param.b32 [func_retval0], %r2; @@ -1006,8 +1006,8 @@ define <2 x i16> @test_insertelement(<2 x i16> %a, i16 %x) #0 { ; COMMON-NEXT: .reg .b32 %r<3>; ; COMMON-EMPTY: ; COMMON-NEXT: // %bb.0: -; COMMON-NEXT: ld.param.u16 %rs1, [test_insertelement_param_1]; -; COMMON-NEXT: ld.param.u32 %r1, [test_insertelement_param_0]; +; COMMON-NEXT: ld.param.b16 %rs1, [test_insertelement_param_1]; +; COMMON-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; ; I16x2-NEXT: mov.b32 {%rs2, _}, %r1; ; NO-I16x2-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r1; } ; COMMON-NEXT: mov.b32 %r2, {%rs2, %rs1}; diff --git a/llvm/test/CodeGen/NVPTX/i8-param.ll b/llvm/test/CodeGen/NVPTX/i8-param.ll index 7c5134696c25..0679ba0fe572 100644 --- a/llvm/test/CodeGen/NVPTX/i8-param.ll +++ b/llvm/test/CodeGen/NVPTX/i8-param.ll @@ -5,7 +5,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK: .visible .func (.param .b32 func_retval0) callee define i8 @callee(i8 %a) { -; CHECK: ld.param.u8 +; CHECK: ld.param.b8 %ret = add i8 %a, 42 ; CHECK: st.param.b32 ret i8 %ret @@ -13,7 +13,7 @@ define i8 @callee(i8 %a) { ; CHECK: .visible .func caller define void @caller(ptr %a) { -; CHECK: ld.u8 +; CHECK: ld.b8 %val = load i8, ptr %a %ret = tail call i8 @callee(i8 %val) ; CHECK: ld.param.b32 diff --git a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll index e9662dd8a7fa..fe8113489592 100644 --- a/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x2-instructions.ll @@ -17,7 +17,7 @@ define i16 @test_bitcast_2xi8_i16(<2 x i8> %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_2xi8_i16_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_2xi8_i16_param_0]; ; CHECK-NEXT: 
mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: shl.b16 %rs3, %rs2, 8; ; CHECK-NEXT: and.b16 %rs4, %rs1, 255; @@ -36,7 +36,7 @@ define <2 x i8> @test_bitcast_i16_2xi8(i16 %a) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [test_bitcast_i16_2xi8_param_0]; +; CHECK-NEXT: ld.param.b16 %rs1, [test_bitcast_i16_2xi8_param_0]; ; CHECK-NEXT: shr.u16 %rs2, %rs1, 8; ; CHECK-NEXT: mov.b32 %r1, {%rs1, %rs2}; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll index 65edcf2e0715..642d5d0e538a 100644 --- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll +++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll @@ -29,7 +29,7 @@ define i8 @test_extract_0(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_extract_0_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_extract_0_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -43,7 +43,7 @@ define i8 @test_extract_1(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_extract_1_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_extract_1_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 8, 8; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -57,7 +57,7 @@ define i8 @test_extract_2(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_extract_2_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_extract_2_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 16, 8; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -71,7 +71,7 @@ define i8 @test_extract_3(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, 
[test_extract_3_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_extract_3_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -86,8 +86,8 @@ define i8 @test_extract_i(<4 x i8> %a, i64 %idx) #0 { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_extract_i_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_extract_i_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_extract_i_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_extract_i_param_0]; ; CHECK-NEXT: cvt.u32.u64 %r2, %rd1; ; CHECK-NEXT: shl.b32 %r3, %r2, 3; ; CHECK-NEXT: bfe.u32 %r4, %r1, %r3, 8; @@ -104,8 +104,8 @@ define <4 x i8> @test_add(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_add_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_add_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_add_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_add_param_0]; ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; ; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; @@ -146,7 +146,7 @@ define <4 x i8> @test_add_imm_0(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_0_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_add_imm_0_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 4; @@ -179,7 +179,7 @@ define <4 x i8> @test_add_imm_1(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_add_imm_1_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_add_imm_1_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 4; @@ -212,8 +212,8 @@ define <4 x i8> @test_sub(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .reg 
.b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_sub_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_sub_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_sub_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_sub_param_0]; ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; ; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; @@ -254,8 +254,8 @@ define <4 x i8> @test_smax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<26>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_smax_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_smax_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_smax_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_smax_param_0]; ; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; ; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.gt.s32 %p1, %r4, %r3; @@ -297,8 +297,8 @@ define <4 x i8> @test_umax(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_umax_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_umax_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_umax_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_umax_param_0]; ; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; ; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; @@ -332,8 +332,8 @@ define <4 x i8> @test_smin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<26>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_smin_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_smin_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_smin_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_smin_param_0]; ; CHECK-NEXT: bfe.s32 %r3, %r2, 0, 8; ; CHECK-NEXT: bfe.s32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.le.s32 %p1, %r4, %r3; @@ -375,8 +375,8 @@ define <4 x i8> @test_umin(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: 
ld.param.u32 %r2, [test_umin_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_umin_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_umin_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_umin_param_0]; ; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; ; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.ls.u32 %p1, %r4, %r3; @@ -410,9 +410,9 @@ define <4 x i8> @test_eq(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-NEXT: .reg .b32 %r<23>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r3, [test_eq_param_2]; -; CHECK-NEXT: ld.param.u32 %r2, [test_eq_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_eq_param_0]; +; CHECK-NEXT: ld.param.b32 %r3, [test_eq_param_2]; +; CHECK-NEXT: ld.param.b32 %r2, [test_eq_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_eq_param_0]; ; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; ; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; ; CHECK-NEXT: setp.eq.u32 %p1, %r5, %r4; @@ -450,9 +450,9 @@ define <4 x i8> @test_ne(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c) #0 { ; CHECK-NEXT: .reg .b32 %r<23>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r3, [test_ne_param_2]; -; CHECK-NEXT: ld.param.u32 %r2, [test_ne_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_ne_param_0]; +; CHECK-NEXT: ld.param.b32 %r3, [test_ne_param_2]; +; CHECK-NEXT: ld.param.b32 %r2, [test_ne_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_ne_param_0]; ; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; ; CHECK-NEXT: bfe.u32 %r5, %r1, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r5, %r4; @@ -490,8 +490,8 @@ define <4 x i8> @test_mul(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<18>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_mul_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_mul_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_mul_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_mul_param_0]; ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; ; CHECK-NEXT: bfe.u32 %r4, %r1, 24, 8; @@ -531,8 +531,8 @@ define 
<4 x i8> @test_or(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_or_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_or_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_or_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_or_param_0]; ; CHECK-NEXT: or.b32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -547,7 +547,7 @@ define <4 x i8> @test_or_computed(i8 %a) { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [test_or_computed_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [test_or_computed_param_0]; ; CHECK-NEXT: mov.b32 %r1, 0; ; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; @@ -569,7 +569,7 @@ define <4 x i8> @test_or_imm_0(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_or_imm_0_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_or_imm_0_param_0]; ; CHECK-NEXT: or.b32 %r2, %r1, 67305985; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -583,7 +583,7 @@ define <4 x i8> @test_or_imm_1(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_or_imm_1_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_or_imm_1_param_0]; ; CHECK-NEXT: or.b32 %r2, %r1, 67305985; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -597,8 +597,8 @@ define <4 x i8> @test_xor(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_xor_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_xor_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_xor_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_xor_param_0]; ; CHECK-NEXT: xor.b32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -613,7 
+613,7 @@ define <4 x i8> @test_xor_computed(i8 %a) { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [test_xor_computed_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [test_xor_computed_param_0]; ; CHECK-NEXT: mov.b32 %r1, 0; ; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; @@ -635,7 +635,7 @@ define <4 x i8> @test_xor_imm_0(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_xor_imm_0_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_xor_imm_0_param_0]; ; CHECK-NEXT: xor.b32 %r2, %r1, 67305985; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -649,7 +649,7 @@ define <4 x i8> @test_xor_imm_1(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_xor_imm_1_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_xor_imm_1_param_0]; ; CHECK-NEXT: xor.b32 %r2, %r1, 67305985; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -663,8 +663,8 @@ define <4 x i8> @test_and(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_and_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_and_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_and_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_and_param_0]; ; CHECK-NEXT: and.b32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -679,7 +679,7 @@ define <4 x i8> @test_and_computed(i8 %a) { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [test_and_computed_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [test_and_computed_param_0]; ; CHECK-NEXT: mov.b32 %r1, 0; ; CHECK-NEXT: prmt.b32 %r2, %r1, 0, 0x3340U; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; @@ -701,7 +701,7 @@ define <4 x i8> @test_and_imm_0(<4 x i8> %a) #0 { ; 
CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_and_imm_0_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_and_imm_0_param_0]; ; CHECK-NEXT: and.b32 %r2, %r1, 67305985; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -715,7 +715,7 @@ define <4 x i8> @test_and_imm_1(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_and_imm_1_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_and_imm_1_param_0]; ; CHECK-NEXT: and.b32 %r2, %r1, 67305985; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -730,10 +730,10 @@ define void @test_ldst_v2i8(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v2i8_param_1]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v2i8_param_0]; -; CHECK-NEXT: ld.u32 %r1, [%rd1]; -; CHECK-NEXT: st.u32 [%rd2], %r1; +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v2i8_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v2i8_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1]; +; CHECK-NEXT: st.b32 [%rd2], %r1; ; CHECK-NEXT: ret; %t1 = load <4 x i8>, ptr %a store <4 x i8> %t1, ptr %b, align 16 @@ -747,12 +747,12 @@ define void @test_ldst_v3i8(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v3i8_param_1]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v3i8_param_0]; -; CHECK-NEXT: ld.u32 %r1, [%rd1]; -; CHECK-NEXT: st.u16 [%rd2], %r1; +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v3i8_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v3i8_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1]; +; CHECK-NEXT: st.b16 [%rd2], %r1; ; CHECK-NEXT: bfe.u32 %r2, %r1, 16, 8; -; CHECK-NEXT: st.u8 [%rd2+2], %r2; +; CHECK-NEXT: st.b8 [%rd2+2], %r2; ; CHECK-NEXT: ret; %t1 = load <3 x i8>, ptr %a store <3 x i8> %t1, ptr %b, align 16 @@ -766,10 +766,10 
@@ define void @test_ldst_v4i8(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4i8_param_1]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4i8_param_0]; -; CHECK-NEXT: ld.u32 %r1, [%rd1]; -; CHECK-NEXT: st.u32 [%rd2], %r1; +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1]; +; CHECK-NEXT: st.b32 [%rd2], %r1; ; CHECK-NEXT: ret; %t1 = load <4 x i8>, ptr %a store <4 x i8> %t1, ptr %b, align 16 @@ -783,16 +783,16 @@ define void @test_ldst_v4i8_unaligned(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v4i8_unaligned_param_1]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v4i8_unaligned_param_0]; -; CHECK-NEXT: ld.u8 %r1, [%rd1]; -; CHECK-NEXT: ld.u8 %r2, [%rd1+1]; -; CHECK-NEXT: ld.u8 %r3, [%rd1+2]; -; CHECK-NEXT: ld.u8 %r4, [%rd1+3]; -; CHECK-NEXT: st.u8 [%rd2+3], %r4; -; CHECK-NEXT: st.u8 [%rd2+2], %r3; -; CHECK-NEXT: st.u8 [%rd2+1], %r2; -; CHECK-NEXT: st.u8 [%rd2], %r1; +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v4i8_unaligned_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v4i8_unaligned_param_0]; +; CHECK-NEXT: ld.b8 %r1, [%rd1]; +; CHECK-NEXT: ld.b8 %r2, [%rd1+1]; +; CHECK-NEXT: ld.b8 %r3, [%rd1+2]; +; CHECK-NEXT: ld.b8 %r4, [%rd1+3]; +; CHECK-NEXT: st.b8 [%rd2+3], %r4; +; CHECK-NEXT: st.b8 [%rd2+2], %r3; +; CHECK-NEXT: st.b8 [%rd2+1], %r2; +; CHECK-NEXT: st.b8 [%rd2], %r1; ; CHECK-NEXT: ret; %t1 = load <4 x i8>, ptr %a, align 1 store <4 x i8> %t1, ptr %b, align 1 @@ -807,8 +807,8 @@ define void @test_ldst_v8i8(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd2, [test_ldst_v8i8_param_1]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldst_v8i8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_ldst_v8i8_param_1]; +; 
CHECK-NEXT: ld.param.b64 %rd1, [test_ldst_v8i8_param_0]; ; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: st.v2.b32 [%rd2], {%r1, %r2}; ; CHECK-NEXT: ret; @@ -825,8 +825,8 @@ define <4 x i8> @test_call(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_call_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_call_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_call_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_call_param_0]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.b32 [param0], %r1; @@ -853,8 +853,8 @@ define <4 x i8> @test_call_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_call_flipped_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_call_flipped_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_call_flipped_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_call_flipped_param_0]; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.b32 [param0], %r2; @@ -881,8 +881,8 @@ define <4 x i8> @test_tailcall_flipped(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_tailcall_flipped_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_tailcall_flipped_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_tailcall_flipped_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tailcall_flipped_param_0]; ; CHECK-NEXT: { // callseq 2, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.b32 [param0], %r2; @@ -911,11 +911,11 @@ define <4 x i8> @test_select(<4 x i8> %a, <4 x i8> %b, i1 zeroext %c) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [test_select_param_2]; +; CHECK-NEXT: ld.param.b8 %rs1, [test_select_param_2]; ; 
CHECK-NEXT: and.b16 %rs2, %rs1, 1; ; CHECK-NEXT: setp.ne.b16 %p1, %rs2, 0; -; CHECK-NEXT: ld.param.u32 %r2, [test_select_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_select_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_select_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_select_param_0]; ; CHECK-NEXT: selp.b32 %r3, %r1, %r2, %p1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -930,10 +930,10 @@ define <4 x i8> @test_select_cc(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> ; CHECK-NEXT: .reg .b32 %r<28>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r4, [test_select_cc_param_3]; -; CHECK-NEXT: ld.param.u32 %r3, [test_select_cc_param_2]; -; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_param_0]; +; CHECK-NEXT: ld.param.b32 %r4, [test_select_cc_param_3]; +; CHECK-NEXT: ld.param.b32 %r3, [test_select_cc_param_2]; +; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_param_0]; ; CHECK-NEXT: bfe.u32 %r5, %r4, 0, 8; ; CHECK-NEXT: bfe.u32 %r6, %r3, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r6, %r5; @@ -975,10 +975,10 @@ define <4 x i32> @test_select_cc_i32_i8(<4 x i32> %a, <4 x i32> %b, ; CHECK-NEXT: .reg .b32 %r<23>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [test_select_cc_i32_i8_param_1]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0]; -; CHECK-NEXT: ld.param.u32 %r10, [test_select_cc_i32_i8_param_3]; -; CHECK-NEXT: ld.param.u32 %r9, [test_select_cc_i32_i8_param_2]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [test_select_cc_i32_i8_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_select_cc_i32_i8_param_0]; +; CHECK-NEXT: ld.param.b32 %r10, [test_select_cc_i32_i8_param_3]; +; CHECK-NEXT: ld.param.b32 %r9, [test_select_cc_i32_i8_param_2]; ; CHECK-NEXT: bfe.u32 %r11, %r10, 0, 8; ; CHECK-NEXT: 
bfe.u32 %r12, %r9, 0, 8; ; CHECK-NEXT: setp.ne.u32 %p1, %r12, %r11; @@ -1010,10 +1010,10 @@ define <4 x i8> @test_select_cc_i8_i32(<4 x i8> %a, <4 x i8> %b, ; CHECK-NEXT: .reg .b32 %r<26>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3]; -; CHECK-NEXT: ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2]; -; CHECK-NEXT: ld.param.u32 %r2, [test_select_cc_i8_i32_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_select_cc_i8_i32_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [test_select_cc_i8_i32_param_3]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [test_select_cc_i8_i32_param_2]; +; CHECK-NEXT: ld.param.b32 %r2, [test_select_cc_i8_i32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_select_cc_i8_i32_param_0]; ; CHECK-NEXT: setp.ne.s32 %p1, %r3, %r7; ; CHECK-NEXT: setp.ne.s32 %p2, %r4, %r8; ; CHECK-NEXT: setp.ne.s32 %p3, %r5, %r9; @@ -1048,7 +1048,7 @@ define <4 x i8> @test_trunc_2xi32(<4 x i32> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [test_trunc_2xi32_param_0]; ; CHECK-NEXT: prmt.b32 %r5, %r3, %r4, 0x3340U; ; CHECK-NEXT: prmt.b32 %r6, %r1, %r2, 0x3340U; ; CHECK-NEXT: prmt.b32 %r7, %r6, %r5, 0x5410U; @@ -1065,8 +1065,8 @@ define <4 x i8> @test_trunc_2xi64(<4 x i64> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16]; -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [test_trunc_2xi64_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [test_trunc_2xi64_param_0]; ; CHECK-NEXT: cvt.u32.u64 %r1, %rd4; ; CHECK-NEXT: cvt.u32.u64 %r2, %rd3; ; CHECK-NEXT: prmt.b32 %r3, %r2, %r1, 0x3340U; @@ -1086,7 +1086,7 @@ 
define <4 x i32> @test_zext_2xi32(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_zext_2xi32_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_zext_2xi32_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: bfe.u32 %r3, %r1, 16, 8; ; CHECK-NEXT: bfe.u32 %r4, %r1, 8, 8; @@ -1104,7 +1104,7 @@ define <4 x i64> @test_zext_2xi64(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg .b64 %rd<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_zext_2xi64_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_zext_2xi64_param_0]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u64.u32 %rd1, %r2; ; CHECK-NEXT: and.b64 %rd2, %rd1, 255; @@ -1130,7 +1130,7 @@ define <4 x i8> @test_bitcast_i32_to_4xi8(i32 %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_i32_to_4xi8_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_i32_to_4xi8_param_0]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %r = bitcast i32 %a to <4 x i8> @@ -1144,7 +1144,7 @@ define <4 x i8> @test_bitcast_float_to_4xi8(float %a) #0 { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [test_bitcast_float_to_4xi8_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [test_bitcast_float_to_4xi8_param_0]; ; CHECK-NEXT: mov.b32 %r1, %f1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -1158,7 +1158,7 @@ define i32 @test_bitcast_4xi8_to_i32(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_4xi8_to_i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_4xi8_to_i32_param_0]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %r = bitcast <4 x i8> %a to i32 @@ -1172,9 +1172,9 @@ define float @test_bitcast_4xi8_to_float(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg 
.b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_bitcast_4xi8_to_float_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_bitcast_4xi8_to_float_param_0]; ; CHECK-NEXT: mov.b32 %f1, %r1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f1; +; CHECK-NEXT: st.param.b32 [func_retval0], %f1; ; CHECK-NEXT: ret; %r = bitcast <4 x i8> %a to float ret float %r @@ -1188,7 +1188,7 @@ define <2 x half> @test_bitcast_4xi8_to_2xhalf(i8 %a) #0 { ; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [test_bitcast_4xi8_to_2xhalf_param_0]; ; CHECK-NEXT: mov.b32 %r1, 6; ; CHECK-NEXT: prmt.b32 %r2, %r1, 7, 0x3340U; ; CHECK-NEXT: cvt.u32.u16 %r3, %rs1; @@ -1211,7 +1211,7 @@ define <4 x i8> @test_shufflevector(<4 x i8> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_shufflevector_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_shufflevector_param_0]; ; CHECK-NEXT: // implicit-def: %r3 ; CHECK-NEXT: prmt.b32 %r2, %r1, %r3, 0x123U; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; @@ -1226,8 +1226,8 @@ define <4 x i8> @test_shufflevector_2(<4 x i8> %a, <4 x i8> %b) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r2, [test_shufflevector_2_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, [test_shufflevector_2_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_shufflevector_2_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_shufflevector_2_param_0]; ; CHECK-NEXT: prmt.b32 %r3, %r1, %r2, 0x2537U; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -1243,8 +1243,8 @@ define <4 x i8> @test_insertelement(<4 x i8> %a, i8 %x) #0 { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [test_insertelement_param_1]; -; CHECK-NEXT: ld.param.u32 %r1, 
[test_insertelement_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [test_insertelement_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_insertelement_param_0]; ; CHECK-NEXT: cvt.u32.u16 %r2, %rs1; ; CHECK-NEXT: bfi.b32 %r3, %r2, %r1, 8, 8; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; @@ -1260,7 +1260,7 @@ define <4 x i8> @test_fptosi_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<12>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_fptosi_4xhalf_to_4xi8_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptosi_4xhalf_to_4xi8_param_0]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.s16.f16 %rs4, %rs1; @@ -1291,7 +1291,7 @@ define <4 x i8> @test_fptoui_4xhalf_to_4xi8(<4 x half> %a) #0 { ; CHECK-NEXT: .reg .b32 %r<12>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [test_fptoui_4xhalf_to_4xi8_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_fptoui_4xhalf_to_4xi8_param_0]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r2; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs3, %rs2; ; CHECK-NEXT: cvt.rzi.u16.f16 %rs4, %rs1; @@ -1323,11 +1323,11 @@ define void @test_srem_v4i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u64 %rd3, [test_srem_v4i8_param_2]; -; CHECK-NEXT: ld.param.u64 %rd2, [test_srem_v4i8_param_1]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_srem_v4i8_param_0]; -; CHECK-NEXT: ld.u32 %r1, [%rd1]; -; CHECK-NEXT: ld.u32 %r2, [%rd2]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_srem_v4i8_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_srem_v4i8_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_srem_v4i8_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1]; +; CHECK-NEXT: ld.b32 %r2, [%rd2]; ; CHECK-NEXT: bfe.s32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.s8.s32 %rs1, %r3; ; CHECK-NEXT: bfe.s32 %r4, %r1, 24, 8; @@ -1355,7 +1355,7 @@ define void @test_srem_v4i8(ptr 
%a, ptr %b, ptr %c) { ; CHECK-NEXT: cvt.u32.u16 %r15, %rs12; ; CHECK-NEXT: prmt.b32 %r16, %r15, %r12, 0x3340U; ; CHECK-NEXT: prmt.b32 %r17, %r16, %r9, 0x5410U; -; CHECK-NEXT: st.u32 [%rd3], %r17; +; CHECK-NEXT: st.b32 [%rd3], %r17; ; CHECK-NEXT: ret; entry: %t57 = load <4 x i8>, ptr %a, align 4 @@ -1379,17 +1379,17 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u64 %rd3, [test_srem_v3i8_param_2]; -; CHECK-NEXT: ld.param.u64 %rd2, [test_srem_v3i8_param_1]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_srem_v3i8_param_0]; -; CHECK-NEXT: ld.u8 %rs1, [%rd1]; -; CHECK-NEXT: ld.u8 %rs2, [%rd1+1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_srem_v3i8_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_srem_v3i8_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_srem_v3i8_param_0]; +; CHECK-NEXT: ld.b8 %rs1, [%rd1]; +; CHECK-NEXT: ld.b8 %rs2, [%rd1+1]; ; CHECK-NEXT: shl.b16 %rs3, %rs2, 8; ; CHECK-NEXT: or.b16 %rs4, %rs3, %rs1; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; ; CHECK-NEXT: ld.s8 %rs5, [%rd1+2]; -; CHECK-NEXT: ld.u8 %rs6, [%rd2]; -; CHECK-NEXT: ld.u8 %rs7, [%rd2+1]; +; CHECK-NEXT: ld.b8 %rs6, [%rd2]; +; CHECK-NEXT: ld.b8 %rs7, [%rd2+1]; ; CHECK-NEXT: shl.b16 %rs8, %rs7, 8; ; CHECK-NEXT: or.b16 %rs9, %rs8, %rs6; ; CHECK-NEXT: cvt.u32.u16 %r2, %rs9; @@ -1413,10 +1413,10 @@ define void @test_srem_v3i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: prmt.b32 %r13, %r9, %r10, 0x5410U; ; CHECK-NEXT: rem.s16 %rs17, %rs5, %rs10; ; CHECK-NEXT: mov.b32 {%rs18, _}, %r13; -; CHECK-NEXT: st.u8 [%rd3], %rs18; +; CHECK-NEXT: st.b8 [%rd3], %rs18; ; CHECK-NEXT: shr.u16 %rs19, %rs18, 8; -; CHECK-NEXT: st.u8 [%rd3+1], %rs19; -; CHECK-NEXT: st.u8 [%rd3+2], %rs17; +; CHECK-NEXT: st.b8 [%rd3+1], %rs19; +; CHECK-NEXT: st.b8 [%rd3+2], %rs17; ; CHECK-NEXT: ret; entry: %t57 = load <3 x i8>, ptr %a, align 1 @@ -1434,11 +1434,11 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) { ; 
CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u64 %rd3, [test_sext_v4i1_to_v4i8_param_2]; -; CHECK-NEXT: ld.param.u64 %rd2, [test_sext_v4i1_to_v4i8_param_1]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_sext_v4i1_to_v4i8_param_0]; -; CHECK-NEXT: ld.u32 %r1, [%rd1]; -; CHECK-NEXT: ld.u32 %r2, [%rd2]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_sext_v4i1_to_v4i8_param_2]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_sext_v4i1_to_v4i8_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_sext_v4i1_to_v4i8_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1]; +; CHECK-NEXT: ld.b32 %r2, [%rd2]; ; CHECK-NEXT: bfe.u32 %r3, %r2, 0, 8; ; CHECK-NEXT: bfe.u32 %r4, %r1, 0, 8; ; CHECK-NEXT: setp.hi.u32 %p1, %r4, %r3; @@ -1458,7 +1458,7 @@ define void @test_sext_v4i1_to_v4i8(ptr %a, ptr %b, ptr %c) { ; CHECK-NEXT: selp.b32 %r15, -1, 0, %p1; ; CHECK-NEXT: prmt.b32 %r16, %r15, %r14, 0x3340U; ; CHECK-NEXT: prmt.b32 %r17, %r16, %r13, 0x5410U; -; CHECK-NEXT: st.u32 [%rd3], %r17; +; CHECK-NEXT: st.b32 [%rd3], %r17; ; CHECK-NEXT: ret; entry: %t1 = load <4 x i8>, ptr %a, align 4 diff --git a/llvm/test/CodeGen/NVPTX/idioms.ll b/llvm/test/CodeGen/NVPTX/idioms.ll index efd61f905dab..d41b9b997335 100644 --- a/llvm/test/CodeGen/NVPTX/idioms.ll +++ b/llvm/test/CodeGen/NVPTX/idioms.ll @@ -40,7 +40,7 @@ define %struct.S16 @i32_to_2xi16(i32 noundef %in) { %low = trunc i32 %in to i16 %high32 = lshr i32 %in, 16 %high = trunc i32 %high32 to i16 -; CHECK: ld.param.u32 %[[R32:r[0-9]+]], [i32_to_2xi16_param_0]; +; CHECK: ld.param.b32 %[[R32:r[0-9]+]], [i32_to_2xi16_param_0]; ; CHECK-DAG: cvt.u16.u32 %rs{{[0-9+]}}, %[[R32]]; ; CHECK-DAG mov.b32 {tmp, %rs{{[0-9+]}}}, %[[R32]]; %s1 = insertvalue %struct.S16 poison, i16 %low, 0 @@ -54,7 +54,7 @@ define %struct.S16 @i32_to_2xi16_lh(i32 noundef %in) { %high32 = lshr i32 %in, 16 %high = trunc i32 %high32 to i16 %low = trunc i32 %in to i16 -; CHECK: ld.param.u32 %[[R32:r[0-9]+]], [i32_to_2xi16_lh_param_0]; +; CHECK: ld.param.b32 
%[[R32:r[0-9]+]], [i32_to_2xi16_lh_param_0]; ; CHECK-DAG: cvt.u16.u32 %rs{{[0-9+]}}, %[[R32]]; ; CHECK-DAG mov.b32 {tmp, %rs{{[0-9+]}}}, %[[R32]]; %s1 = insertvalue %struct.S16 poison, i16 %low, 0 @@ -82,7 +82,7 @@ define %struct.S32 @i64_to_2xi32(i64 noundef %in) { %low = trunc i64 %in to i32 %high64 = lshr i64 %in, 32 %high = trunc i64 %high64 to i32 -; CHECK: ld.param.u64 %[[R64:rd[0-9]+]], [i64_to_2xi32_param_0]; +; CHECK: ld.param.b64 %[[R64:rd[0-9]+]], [i64_to_2xi32_param_0]; ; CHECK-DAG: cvt.u32.u64 %r{{[0-9+]}}, %[[R64]]; ; CHECK-DAG mov.b64 {tmp, %r{{[0-9+]}}}, %[[R64]]; %s1 = insertvalue %struct.S32 poison, i32 %low, 0 @@ -112,7 +112,7 @@ define %struct.S16 @i32_to_2xi16_shr(i32 noundef %i){ %l = trunc i32 %i1 to i16 %h32 = ashr i32 %i1, 16 %h = trunc i32 %h32 to i16 -; CHECK: ld.param.u32 %[[R32:r[0-9]+]], [i32_to_2xi16_shr_param_0]; +; CHECK: ld.param.b32 %[[R32:r[0-9]+]], [i32_to_2xi16_shr_param_0]; ; CHECK: shr.s32 %[[R32H:r[0-9]+]], %[[R32]], 16; ; CHECK-DAG mov.b32 {tmp, %rs{{[0-9+]}}}, %[[R32]]; ; CHECK-DAG mov.b32 {tmp, %rs{{[0-9+]}}}, %[[R32H]]; diff --git a/llvm/test/CodeGen/NVPTX/indirect_byval.ll b/llvm/test/CodeGen/NVPTX/indirect_byval.ll index 4509fcfd1a9b..1341a04c939c 100644 --- a/llvm/test/CodeGen/NVPTX/indirect_byval.ll +++ b/llvm/test/CodeGen/NVPTX/indirect_byval.ll @@ -22,9 +22,9 @@ define internal i32 @foo() { ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: mov.b64 %SPL, __local_depot0; ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; -; CHECK-NEXT: ld.global.u64 %rd1, [ptr]; +; CHECK-NEXT: ld.global.b64 %rd1, [ptr]; ; CHECK-NEXT: add.u64 %rd3, %SPL, 1; -; CHECK-NEXT: ld.local.u8 %rs1, [%rd3]; +; CHECK-NEXT: ld.local.b8 %rs1, [%rd3]; ; CHECK-NEXT: add.u64 %rd4, %SP, 0; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 1 .b8 param0[1]; @@ -65,9 +65,9 @@ define internal i32 @bar() { ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: mov.b64 %SPL, __local_depot1; ; CHECK-NEXT: cvta.local.u64 %SP, %SPL; -; CHECK-NEXT: ld.global.u64 %rd1, 
[ptr]; +; CHECK-NEXT: ld.global.b64 %rd1, [ptr]; ; CHECK-NEXT: add.u64 %rd3, %SPL, 8; -; CHECK-NEXT: ld.local.u64 %rd4, [%rd3]; +; CHECK-NEXT: ld.local.b64 %rd4, [%rd3]; ; CHECK-NEXT: add.u64 %rd5, %SP, 0; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll index 67c074ca7315..5cfdbb7447ad 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test1.ll @@ -33,9 +33,9 @@ define void @test_b128_input_from_load(ptr nocapture readonly %data) { ; CHECK-NEXT: .reg .b128 %rq<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd2, [test_b128_input_from_load_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_b128_input_from_load_param_0]; ; CHECK-NEXT: cvta.to.global.u64 %rd3, %rd2; -; CHECK-NEXT: ld.global.v2.u64 {%rd4, %rd5}, [%rd3]; +; CHECK-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd3]; ; CHECK-NEXT: mov.b64 %rd6, value; ; CHECK-NEXT: cvta.global.u64 %rd1, %rd6; ; CHECK-NEXT: mov.b128 %rq1, {%rd4, %rd5}; @@ -59,9 +59,9 @@ define void @test_b128_input_from_select(ptr nocapture readonly %flag) { ; CHECK-NEXT: .reg .b128 %rq<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd2, [test_b128_input_from_select_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_b128_input_from_select_param_0]; ; CHECK-NEXT: cvta.to.global.u64 %rd3, %rd2; -; CHECK-NEXT: ld.global.u8 %rs1, [%rd3]; +; CHECK-NEXT: ld.global.b8 %rs1, [%rd3]; ; CHECK-NEXT: setp.eq.s16 %p1, %rs1, 0; ; CHECK-NEXT: selp.b64 %rd4, 24, 42, %p1; ; CHECK-NEXT: mov.b64 %rd5, 0; @@ -93,7 +93,7 @@ define void @test_store_b128_output() { ; CHECK-NEXT: mov.b128 {%rd1, %rd2}, %rq1; ; CHECK-NEXT: add.cc.s64 %rd3, %rd1, 1; ; CHECK-NEXT: addc.cc.s64 %rd4, %rd2, 0; -; CHECK-NEXT: st.global.v2.u64 [value], {%rd3, %rd4}; +; CHECK-NEXT: st.global.v2.b64 [value], {%rd3, %rd4}; ; CHECK-NEXT: ret; %1 = tail call 
i128 asm "{ mov.b128 $0, 41; }", "=q"() %add = add nsw i128 %1, 1 @@ -109,9 +109,9 @@ define void @test_use_of_b128_output(ptr nocapture readonly %data) { ; CHECK-NEXT: .reg .b128 %rq<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_use_of_b128_output_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_use_of_b128_output_param_0]; ; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1; -; CHECK-NEXT: ld.global.v2.u64 {%rd3, %rd4}, [%rd2]; +; CHECK-NEXT: ld.global.v2.b64 {%rd3, %rd4}, [%rd2]; ; CHECK-NEXT: mov.b128 %rq2, {%rd3, %rd4}; ; CHECK-NEXT: // begin inline asm ; CHECK-NEXT: { mov.b128 %rq1, %rq2; } @@ -119,7 +119,7 @@ define void @test_use_of_b128_output(ptr nocapture readonly %data) { ; CHECK-NEXT: mov.b128 {%rd5, %rd6}, %rq1; ; CHECK-NEXT: add.cc.s64 %rd7, %rd5, 1; ; CHECK-NEXT: addc.cc.s64 %rd8, %rd6, 0; -; CHECK-NEXT: st.global.v2.u64 [value], {%rd7, %rd8}; +; CHECK-NEXT: st.global.v2.b64 [value], {%rd7, %rd8}; ; CHECK-NEXT: ret; %1 = addrspacecast ptr %data to ptr addrspace(1) %2 = load <2 x i64>, ptr addrspace(1) %1, align 16 diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll index 8ca863bba5f4..52bd51b3ef7f 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test2.ll @@ -21,7 +21,7 @@ define void @test_corner_values() { ; CHECK-NEXT: .reg .b128 %rq<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.global.u64 %rd1, [v64]; +; CHECK-NEXT: ld.global.b64 %rd1, [v64]; ; CHECK-NEXT: add.s64 %rd2, %rd1, 8; ; CHECK-NEXT: mov.b64 %rd13, -1; ; CHECK-NEXT: mov.b128 %rq1, {%rd13, %rd13}; @@ -37,7 +37,7 @@ define void @test_corner_values() { ; CHECK-NEXT: st.b128 [%rd3], %rq1; ; CHECK-NEXT: } ; CHECK-NEXT: // end inline asm -; CHECK-NEXT: ld.global.u64 %rd15, [v64]; +; CHECK-NEXT: ld.global.b64 %rd15, [v64]; ; CHECK-NEXT: add.s64 %rd4, %rd15, 16; ; CHECK-NEXT: add.s64 %rd5, %rd15, 24; ; CHECK-NEXT: mov.b64 %rd16, 
9223372036854775807; @@ -54,7 +54,7 @@ define void @test_corner_values() { ; CHECK-NEXT: st.b128 [%rd6], %rq2; ; CHECK-NEXT: } ; CHECK-NEXT: // end inline asm -; CHECK-NEXT: ld.global.u64 %rd18, [v64]; +; CHECK-NEXT: ld.global.b64 %rd18, [v64]; ; CHECK-NEXT: add.s64 %rd7, %rd18, 32; ; CHECK-NEXT: add.s64 %rd8, %rd18, 40; ; CHECK-NEXT: mov.b64 %rd19, -9223372036854775808; @@ -72,7 +72,7 @@ define void @test_corner_values() { ; CHECK-NEXT: st.b128 [%rd9], %rq3; ; CHECK-NEXT: } ; CHECK-NEXT: // end inline asm -; CHECK-NEXT: ld.global.u64 %rd22, [v64]; +; CHECK-NEXT: ld.global.b64 %rd22, [v64]; ; CHECK-NEXT: add.s64 %rd10, %rd22, 48; ; CHECK-NEXT: add.s64 %rd11, %rd22, 56; ; CHECK-NEXT: mov.b128 %rq4, {%rd20, %rd20}; diff --git a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll index 4ea31dd52a32..6dbf44f38aa2 100644 --- a/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll +++ b/llvm/test/CodeGen/NVPTX/inline-asm-b128-test3.ll @@ -19,7 +19,7 @@ define void @test_b128_in_loop() { ; CHECK-NEXT: setp.eq.s64 %p1, %rd1, 0; ; CHECK-NEXT: @%p1 bra $L__BB0_3; ; CHECK-NEXT: // %bb.1: // %BB1 -; CHECK-NEXT: ld.global.v2.u64 {%rd12, %rd13}, [x]; +; CHECK-NEXT: ld.global.v2.b64 {%rd12, %rd13}, [x]; ; CHECK-NEXT: mov.b64 %rd14, 0; ; CHECK-NEXT: $L__BB0_2: // %BB2 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 @@ -34,7 +34,7 @@ define void @test_b128_in_loop() { ; CHECK-NEXT: } ; CHECK-NEXT: // end inline asm ; CHECK-NEXT: mov.b128 {%rd12, %rd13}, %rq1; -; CHECK-NEXT: st.global.v2.u64 [x], {%rd12, %rd13}; +; CHECK-NEXT: st.global.v2.b64 [x], {%rd12, %rd13}; ; CHECK-NEXT: add.s64 %rd14, %rd14, 1; ; CHECK-NEXT: setp.ne.s64 %p2, %rd1, %rd14; ; CHECK-NEXT: @%p2 bra $L__BB0_2; diff --git a/llvm/test/CodeGen/NVPTX/intrinsics.ll b/llvm/test/CodeGen/NVPTX/intrinsics.ll index 01c51bb72d05..a8beeb287c22 100644 --- a/llvm/test/CodeGen/NVPTX/intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsics.ll @@ -10,9 +10,9 @@ define float 
@test_fabsf(float %f) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [test_fabsf_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [test_fabsf_param_0]; ; CHECK-NEXT: abs.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %x = call float @llvm.fabs.f32(float %f) ret float %x @@ -24,9 +24,9 @@ define double @test_fabs(double %d) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [test_fabs_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [test_fabs_param_0]; ; CHECK-NEXT: abs.f64 %fd2, %fd1; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd2; ; CHECK-NEXT: ret; %x = call double @llvm.fabs.f64(double %d) ret double %x @@ -38,9 +38,9 @@ define float @test_nvvm_sqrt(float %a) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [test_nvvm_sqrt_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [test_nvvm_sqrt_param_0]; ; CHECK-NEXT: sqrt.rn.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %val = call float @llvm.nvvm.sqrt.f(float %a) ret float %val @@ -52,9 +52,9 @@ define float @test_llvm_sqrt(float %a) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [test_llvm_sqrt_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [test_llvm_sqrt_param_0]; ; CHECK-NEXT: sqrt.rn.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %val = call float @llvm.sqrt.f32(float %a) ret float %val @@ -66,7 +66,7 @@ define i32 @test_bitreverse32(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_bitreverse32_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, 
[test_bitreverse32_param_0]; ; CHECK-NEXT: brev.b32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -80,7 +80,7 @@ define i64 @test_bitreverse64(i64 %a) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_bitreverse64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_bitreverse64_param_0]; ; CHECK-NEXT: brev.b64 %rd2, %rd1; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; ; CHECK-NEXT: ret; @@ -94,7 +94,7 @@ define i32 @test_popc32(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_popc32_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_popc32_param_0]; ; CHECK-NEXT: popc.b32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -109,7 +109,7 @@ define i64 @test_popc64(i64 %a) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_popc64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_popc64_param_0]; ; CHECK-NEXT: popc.b64 %r1, %rd1; ; CHECK-NEXT: cvt.u64.u32 %rd2, %r1; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; @@ -128,7 +128,7 @@ define i32 @test_popc64_trunc(i64 %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_popc64_trunc_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_popc64_trunc_param_0]; ; CHECK-NEXT: popc.b64 %r1, %rd1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -145,10 +145,10 @@ define void @test_popc16(i16 %a, ptr %b) { ; CHECK32-NEXT: .reg .b32 %r<4>; ; CHECK32-EMPTY: ; CHECK32-NEXT: // %bb.0: -; CHECK32-NEXT: ld.param.u16 %r1, [test_popc16_param_0]; +; CHECK32-NEXT: ld.param.b16 %r1, [test_popc16_param_0]; ; CHECK32-NEXT: popc.b32 %r2, %r1; -; CHECK32-NEXT: ld.param.u32 %r3, [test_popc16_param_1]; -; CHECK32-NEXT: st.u16 [%r3], %r2; +; CHECK32-NEXT: ld.param.b32 %r3, [test_popc16_param_1]; +; CHECK32-NEXT: 
st.b16 [%r3], %r2; ; CHECK32-NEXT: ret; ; ; CHECK64-LABEL: test_popc16( @@ -157,10 +157,10 @@ define void @test_popc16(i16 %a, ptr %b) { ; CHECK64-NEXT: .reg .b64 %rd<2>; ; CHECK64-EMPTY: ; CHECK64-NEXT: // %bb.0: -; CHECK64-NEXT: ld.param.u16 %r1, [test_popc16_param_0]; +; CHECK64-NEXT: ld.param.b16 %r1, [test_popc16_param_0]; ; CHECK64-NEXT: popc.b32 %r2, %r1; -; CHECK64-NEXT: ld.param.u64 %rd1, [test_popc16_param_1]; -; CHECK64-NEXT: st.u16 [%rd1], %r2; +; CHECK64-NEXT: ld.param.b64 %rd1, [test_popc16_param_1]; +; CHECK64-NEXT: st.b16 [%rd1], %r2; ; CHECK64-NEXT: ret; %val = call i16 @llvm.ctpop.i16(i16 %a) store i16 %val, ptr %b @@ -175,7 +175,7 @@ define i32 @test_popc16_to_32(i16 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %r1, [test_popc16_to_32_param_0]; +; CHECK-NEXT: ld.param.b16 %r1, [test_popc16_to_32_param_0]; ; CHECK-NEXT: popc.b32 %r2, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/jump-table.ll b/llvm/test/CodeGen/NVPTX/jump-table.ll index dbd4f8a55fac..0718e6d603b6 100644 --- a/llvm/test/CodeGen/NVPTX/jump-table.ll +++ b/llvm/test/CodeGen/NVPTX/jump-table.ll @@ -13,7 +13,7 @@ define void @foo(i32 %i) { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u32 %r2, [foo_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [foo_param_0]; ; CHECK-NEXT: setp.gt.u32 %p1, %r2, 3; ; CHECK-NEXT: @%p1 bra $L__BB0_6; ; CHECK-NEXT: // %bb.1: // %entry @@ -25,19 +25,19 @@ define void @foo(i32 %i) { ; CHECK-NEXT: brx.idx %r2, $L_brx_0; ; CHECK-NEXT: $L__BB0_2: // %case0 ; CHECK-NEXT: mov.b32 %r6, 0; -; CHECK-NEXT: st.global.u32 [out], %r6; +; CHECK-NEXT: st.global.b32 [out], %r6; ; CHECK-NEXT: bra.uni $L__BB0_6; ; CHECK-NEXT: $L__BB0_4: // %case2 ; CHECK-NEXT: mov.b32 %r4, 2; -; CHECK-NEXT: st.global.u32 [out], %r4; +; CHECK-NEXT: st.global.b32 [out], %r4; ; CHECK-NEXT: bra.uni $L__BB0_6; ; CHECK-NEXT: 
$L__BB0_5: // %case3 ; CHECK-NEXT: mov.b32 %r3, 3; -; CHECK-NEXT: st.global.u32 [out], %r3; +; CHECK-NEXT: st.global.b32 [out], %r3; ; CHECK-NEXT: bra.uni $L__BB0_6; ; CHECK-NEXT: $L__BB0_3: // %case1 ; CHECK-NEXT: mov.b32 %r5, 1; -; CHECK-NEXT: st.global.u32 [out], %r5; +; CHECK-NEXT: st.global.b32 [out], %r5; ; CHECK-NEXT: $L__BB0_6: // %end ; CHECK-NEXT: ret; entry: @@ -76,7 +76,7 @@ define i32 @test2(i32 %tmp158) { ; CHECK-NEXT: .reg .b32 %r<10>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u32 %r1, [test2_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test2_param_0]; ; CHECK-NEXT: setp.gt.s32 %p1, %r1, 119; ; CHECK-NEXT: @%p1 bra $L__BB1_4; ; CHECK-NEXT: // %bb.1: // %entry diff --git a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll index ec20fda67ec0..24071b48143f 100644 --- a/llvm/test/CodeGen/NVPTX/ld-addrspace.ll +++ b/llvm/test/CodeGen/NVPTX/ld-addrspace.ll @@ -9,24 +9,24 @@ ;; i8 define i8 @ld_global_i8(ptr addrspace(1) %ptr) { ; ALL-LABEL: ld_global_i8 -; G32: ld.global.u8 %{{.*}}, [%r{{[0-9]+}}] -; G64: ld.global.u8 %{{.*}}, [%rd{{[0-9]+}}] +; G32: ld.global.b8 %{{.*}}, [%r{{[0-9]+}}] +; G64: ld.global.b8 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load i8, ptr addrspace(1) %ptr ret i8 %a } define i8 @ld_shared_i8(ptr addrspace(3) %ptr) { ; ALL-LABEL: ld_shared_i8 -; LS32: ld.shared.u8 %{{.*}}, [%r{{[0-9]+}}] -; LS64: ld.shared.u8 %{{.*}}, [%rd{{[0-9]+}}] +; LS32: ld.shared.b8 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.shared.b8 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load i8, ptr addrspace(3) %ptr ret i8 %a } define i8 @ld_local_i8(ptr addrspace(5) %ptr) { ; ALL-LABEL: ld_local_i8 -; LS32: ld.local.u8 %{{.*}}, [%r{{[0-9]+}}] -; LS64: ld.local.u8 %{{.*}}, [%rd{{[0-9]+}}] +; LS32: ld.local.b8 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.local.b8 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load i8, ptr addrspace(5) %ptr ret i8 %a @@ -35,24 +35,24 @@ define i8 @ld_local_i8(ptr addrspace(5) %ptr) { ;; i16 define i16 
@ld_global_i16(ptr addrspace(1) %ptr) { ; ALL-LABEL: ld_global_i16 -; G32: ld.global.u16 %{{.*}}, [%r{{[0-9]+}}] -; G64: ld.global.u16 %{{.*}}, [%rd{{[0-9]+}}] +; G32: ld.global.b16 %{{.*}}, [%r{{[0-9]+}}] +; G64: ld.global.b16 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load i16, ptr addrspace(1) %ptr ret i16 %a } define i16 @ld_shared_i16(ptr addrspace(3) %ptr) { ; ALL-LABEL: ld_shared_i16 -; LS32: ld.shared.u16 %{{.*}}, [%r{{[0-9]+}}] -; LS64: ld.shared.u16 %{{.*}}, [%rd{{[0-9]+}}] +; LS32: ld.shared.b16 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.shared.b16 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load i16, ptr addrspace(3) %ptr ret i16 %a } define i16 @ld_local_i16(ptr addrspace(5) %ptr) { ; ALL-LABEL: ld_local_i16 -; LS32: ld.local.u16 %{{.*}}, [%r{{[0-9]+}}] -; LS64: ld.local.u16 %{{.*}}, [%rd{{[0-9]+}}] +; LS32: ld.local.b16 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.local.b16 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load i16, ptr addrspace(5) %ptr ret i16 %a @@ -61,24 +61,24 @@ define i16 @ld_local_i16(ptr addrspace(5) %ptr) { ;; i32 define i32 @ld_global_i32(ptr addrspace(1) %ptr) { ; ALL-LABEL: ld_global_i32 -; G32: ld.global.u32 %{{.*}}, [%r{{[0-9]+}}] -; G64: ld.global.u32 %{{.*}}, [%rd{{[0-9]+}}] +; G32: ld.global.b32 %{{.*}}, [%r{{[0-9]+}}] +; G64: ld.global.b32 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load i32, ptr addrspace(1) %ptr ret i32 %a } define i32 @ld_shared_i32(ptr addrspace(3) %ptr) { ; ALL-LABEL: ld_shared_i32 -; LS32: ld.shared.u32 %{{.*}}, [%r{{[0-9]+}}] -; LS64: ld.shared.u32 %{{.*}}, [%rd{{[0-9]+}}] +; LS32: ld.shared.b32 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.shared.b32 %{{.*}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i32, ptr addrspace(3) %ptr ret i32 %a } define i32 @ld_local_i32(ptr addrspace(5) %ptr) { ; ALL-LABEL: ld_local_i32 -; LS32: ld.local.u32 %{{.*}}, [%r{{[0-9]+}}] -; LS64: ld.local.u32 %{{.*}}, [%rd{{[0-9]+}}] +; LS32: ld.local.b32 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.local.b32 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load i32, ptr 
addrspace(5) %ptr ret i32 %a @@ -87,24 +87,24 @@ define i32 @ld_local_i32(ptr addrspace(5) %ptr) { ;; i64 define i64 @ld_global_i64(ptr addrspace(1) %ptr) { ; ALL-LABEL: ld_global_i64 -; G32: ld.global.u64 %{{.*}}, [%r{{[0-9]+}}] -; G64: ld.global.u64 %{{.*}}, [%rd{{[0-9]+}}] +; G32: ld.global.b64 %{{.*}}, [%r{{[0-9]+}}] +; G64: ld.global.b64 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load i64, ptr addrspace(1) %ptr ret i64 %a } define i64 @ld_shared_i64(ptr addrspace(3) %ptr) { ; ALL-LABEL: ld_shared_i64 -; LS32: ld.shared.u64 %{{.*}}, [%r{{[0-9]+}}] -; LS64: ld.shared.u64 %{{.*}}, [%rd{{[0-9]+}}] +; LS32: ld.shared.b64 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.shared.b64 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load i64, ptr addrspace(3) %ptr ret i64 %a } define i64 @ld_local_i64(ptr addrspace(5) %ptr) { ; ALL-LABEL: ld_local_i64 -; LS32: ld.local.u64 %{{.*}}, [%r{{[0-9]+}}] -; LS64: ld.local.u64 %{{.*}}, [%rd{{[0-9]+}}] +; LS32: ld.local.b64 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.local.b64 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load i64, ptr addrspace(5) %ptr ret i64 %a @@ -113,24 +113,24 @@ define i64 @ld_local_i64(ptr addrspace(5) %ptr) { ;; f32 define float @ld_global_f32(ptr addrspace(1) %ptr) { ; ALL-LABEL: ld_global_f32 -; G32: ld.global.f32 %{{.*}}, [%r{{[0-9]+}}] -; G64: ld.global.f32 %{{.*}}, [%rd{{[0-9]+}}] +; G32: ld.global.b32 %{{.*}}, [%r{{[0-9]+}}] +; G64: ld.global.b32 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load float, ptr addrspace(1) %ptr ret float %a } define float @ld_shared_f32(ptr addrspace(3) %ptr) { ; ALL-LABEL: ld_shared_f32 -; LS32: ld.shared.f32 %{{.*}}, [%r{{[0-9]+}}] -; LS64: ld.shared.f32 %{{.*}}, [%rd{{[0-9]+}}] +; LS32: ld.shared.b32 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.shared.b32 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load float, ptr addrspace(3) %ptr ret float %a } define float @ld_local_f32(ptr addrspace(5) %ptr) { ; ALL-LABEL: ld_local_f32 -; LS32: ld.local.f32 %{{.*}}, [%r{{[0-9]+}}] -; LS64: ld.local.f32 %{{.*}}, [%rd{{[0-9]+}}] +; 
LS32: ld.local.b32 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.local.b32 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load float, ptr addrspace(5) %ptr ret float %a @@ -139,24 +139,24 @@ define float @ld_local_f32(ptr addrspace(5) %ptr) { ;; f64 define double @ld_global_f64(ptr addrspace(1) %ptr) { ; ALL-LABEL: ld_global_f64 -; G32: ld.global.f64 %{{.*}}, [%r{{[0-9]+}}] -; G64: ld.global.f64 %{{.*}}, [%rd{{[0-9]+}}] +; G32: ld.global.b64 %{{.*}}, [%r{{[0-9]+}}] +; G64: ld.global.b64 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load double, ptr addrspace(1) %ptr ret double %a } define double @ld_shared_f64(ptr addrspace(3) %ptr) { ; ALL-LABEL: ld_shared_f64 -; LS32: ld.shared.f64 %{{.*}}, [%r{{[0-9]+}}] -; LS64: ld.shared.f64 %{{.*}}, [%rd{{[0-9]+}}] +; LS32: ld.shared.b64 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.shared.b64 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load double, ptr addrspace(3) %ptr ret double %a } define double @ld_local_f64(ptr addrspace(5) %ptr) { ; ALL-LABEL: ld_local_f64 -; LS32: ld.local.f64 %{{.*}}, [%r{{[0-9]+}}] -; LS64: ld.local.f64 %{{.*}}, [%rd{{[0-9]+}}] +; LS32: ld.local.b64 %{{.*}}, [%r{{[0-9]+}}] +; LS64: ld.local.b64 %{{.*}}, [%rd{{[0-9]+}}] ; ALL: ret %a = load double, ptr addrspace(5) %ptr ret double %a diff --git a/llvm/test/CodeGen/NVPTX/ld-generic.ll b/llvm/test/CodeGen/NVPTX/ld-generic.ll index cfc4491ded1e..ce922dd8a5ac 100644 --- a/llvm/test/CodeGen/NVPTX/ld-generic.ll +++ b/llvm/test/CodeGen/NVPTX/ld-generic.ll @@ -6,9 +6,9 @@ ;; i8 define i8 @ld_global_i8(ptr addrspace(0) %ptr) { -; PTX32: ld.u8 %r{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.b8 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.u8 %r{{[0-9]+}}, [%rd{{[0-9]+}}] +; PTX64: ld.b8 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i8, ptr addrspace(0) %ptr ret i8 %a @@ -16,9 +16,9 @@ define i8 @ld_global_i8(ptr addrspace(0) %ptr) { ;; i16 define i16 @ld_global_i16(ptr addrspace(0) %ptr) { -; PTX32: ld.u16 %r{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.b16 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; 
PTX32: ret -; PTX64: ld.u16 %r{{[0-9]+}}, [%rd{{[0-9]+}}] +; PTX64: ld.b16 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i16, ptr addrspace(0) %ptr ret i16 %a @@ -26,9 +26,9 @@ define i16 @ld_global_i16(ptr addrspace(0) %ptr) { ;; i32 define i32 @ld_global_i32(ptr addrspace(0) %ptr) { -; PTX32: ld.u32 %r{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.b32 %r{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] +; PTX64: ld.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i32, ptr addrspace(0) %ptr ret i32 %a @@ -36,9 +36,9 @@ define i32 @ld_global_i32(ptr addrspace(0) %ptr) { ;; i64 define i64 @ld_global_i64(ptr addrspace(0) %ptr) { -; PTX32: ld.u64 %rd{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.b64 %rd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] +; PTX64: ld.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load i64, ptr addrspace(0) %ptr ret i64 %a @@ -46,9 +46,9 @@ define i64 @ld_global_i64(ptr addrspace(0) %ptr) { ;; f32 define float @ld_global_f32(ptr addrspace(0) %ptr) { -; PTX32: ld.f32 %f{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.b32 %f{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] +; PTX64: ld.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load float, ptr addrspace(0) %ptr ret float %a @@ -56,9 +56,9 @@ define float @ld_global_f32(ptr addrspace(0) %ptr) { ;; f64 define double @ld_global_f64(ptr addrspace(0) %ptr) { -; PTX32: ld.f64 %fd{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.b64 %fd{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: ret -; PTX64: ld.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] +; PTX64: ld.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: ret %a = load double, ptr addrspace(0) %ptr ret double %a diff --git a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py index 3f2ec2606e5e..3f0c6b029125 100644 --- a/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py +++ b/llvm/test/CodeGen/NVPTX/ld-st-addrrspace.py @@ -12,6 +12,18 
@@ from __future__ import print_function from itertools import product from string import Template + +llvm_type_to_ptx_load_type = { + "i8": "b8", + "i16": "b16", + "i32": "b32", + "i64": "b64", + "half": "b16", + "<2 x half>": "b32", + "float": "b32", + "double": "b64", +} + llvm_type_to_ptx_type = { "i8": "u8", "i16": "u16", @@ -48,8 +60,8 @@ def gen_load_tests(): load_template = """ define ${type} @${testname}(${type} addrspace(${asid})* %ptr) { ; CHECK: ${testname} -; CHECK_P32: ld${_volatile}${_volatile_as}.${ptx_type} %${ptx_reg}{{[0-9]+}}, [%r{{[0-9]+}}] -; CHECK_P64: ld${_volatile}${_volatile_as}.${ptx_type} %${ptx_reg}{{[0-9]+}}, [%rd{{[0-9]+}}] +; CHECK_P32: ld${_volatile}${_volatile_as}.${ptx_load_type} %${ptx_reg}{{[0-9]+}}, [%r{{[0-9]+}}] +; CHECK_P64: ld${_volatile}${_volatile_as}.${ptx_load_type} %${ptx_reg}{{[0-9]+}}, [%rd{{[0-9]+}}] ; CHECK: ret %p = ${generic_ptr} %a = load ${volatile} ${type}, ${type}* %p @@ -81,6 +93,7 @@ define ${type} @${testname}(${type} addrspace(${asid})* %ptr) { "_space": space, "ptx_reg": llvm_type_to_ptx_reg[op_type], "ptx_type": llvm_type_to_ptx_type[op_type], + "ptx_load_type": llvm_type_to_ptx_load_type[op_type], "asid": addrspace_id[space], } diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll index 2fe2d28320f0..3a342e4d838c 100644 --- a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll +++ b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll @@ -12,8 +12,8 @@ define i32 @ld_global(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_param_0]; -; CHECK-NEXT: ld.global.nc.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_param_0]; +; CHECK-NEXT: ld.global.nc.b32 %r1, [%rd1]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %a = load i32, ptr addrspace(1) %ptr, !invariant.load !0 @@ -31,8 +31,8 @@ define half @ld_global_v2f16(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 
%rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v2f16_param_0]; -; CHECK-NEXT: ld.global.nc.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v2f16_param_0]; +; CHECK-NEXT: ld.global.nc.b32 %r1, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; ; CHECK-NEXT: cvt.f32.f16 %f2, %rs1; @@ -58,8 +58,8 @@ define half @ld_global_v4f16(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v4f16_param_0]; -; CHECK-NEXT: ld.global.nc.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v4f16_param_0]; +; CHECK-NEXT: ld.global.nc.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; ; CHECK-NEXT: cvt.f32.f16 %f1, %rs2; ; CHECK-NEXT: cvt.f32.f16 %f2, %rs1; ; CHECK-NEXT: add.rn.f32 %f3, %f2, %f1; @@ -96,8 +96,8 @@ define half @ld_global_v8f16(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v8f16_param_0]; -; CHECK-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v8f16_param_0]; +; CHECK-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; } ; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r4; } ; CHECK-NEXT: { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r1; } @@ -135,8 +135,8 @@ define i8 @ld_global_v8i8(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v8i8_param_0]; -; CHECK-NEXT: ld.global.nc.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v8i8_param_0]; +; CHECK-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r3, %r2, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; ; CHECK-NEXT: bfe.u32 %r4, %r2, 0, 8; @@ -171,8 +171,8 @@ define i8 @ld_global_v16i8(ptr 
addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v16i8_param_0]; -; CHECK-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v16i8_param_0]; +; CHECK-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r5, %r4, 16, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; ; CHECK-NEXT: bfe.u32 %r6, %r4, 0, 8; @@ -226,8 +226,8 @@ define i32 @ld_global_v2i32(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v2i32_param_0]; -; CHECK-NEXT: ld.global.nc.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v2i32_param_0]; +; CHECK-NEXT: ld.global.nc.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: add.s32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -245,8 +245,8 @@ define i32 @ld_global_v4i32(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [ld_global_v4i32_param_0]; -; CHECK-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [ld_global_v4i32_param_0]; +; CHECK-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: add.s32 %r5, %r1, %r2; ; CHECK-NEXT: add.s32 %r6, %r3, %r4; ; CHECK-NEXT: add.s32 %r7, %r5, %r6; @@ -270,8 +270,8 @@ define i32 @ld_not_invariant(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [ld_not_invariant_param_0]; -; CHECK-NEXT: ld.global.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [ld_not_invariant_param_0]; +; CHECK-NEXT: ld.global.b32 %r1, [%rd1]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %a = load i32, ptr addrspace(1) %ptr @@ -285,8 +285,8 @@ define i32 @ld_not_global_addrspace(ptr addrspace(0) %ptr) { ; CHECK-NEXT: 
.reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [ld_not_global_addrspace_param_0]; -; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [ld_not_global_addrspace_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %a = load i32, ptr addrspace(0) %ptr diff --git a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll index 12910f59f258..0a528f0e8da0 100644 --- a/llvm/test/CodeGen/NVPTX/ldparam-v4.ll +++ b/llvm/test/CodeGen/NVPTX/ldparam-v4.ll @@ -5,9 +5,9 @@ declare <4 x float> @bar() ; CHECK-LABEL: .func foo( define void @foo(ptr %ptr) { -; CHECK: ld.param.u64 %[[PTR:rd[0-9]+]], [foo_param_0]; -; CHECK: ld.param.v4.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0]; -; CHECK: st.v4.f32 [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]} +; CHECK: ld.param.b64 %[[PTR:rd[0-9]+]], [foo_param_0]; +; CHECK: ld.param.v4.b32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0]; +; CHECK: st.v4.b32 [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]} %val = tail call <4 x float> @bar() store <4 x float> %val, ptr %ptr ret void diff --git a/llvm/test/CodeGen/NVPTX/ldu-i8.ll b/llvm/test/CodeGen/NVPTX/ldu-i8.ll index 93f3326b70bf..89f23f30f34e 100644 --- a/llvm/test/CodeGen/NVPTX/ldu-i8.ll +++ b/llvm/test/CodeGen/NVPTX/ldu-i8.ll @@ -7,7 +7,7 @@ declare i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr, i32) define i8 @foo(ptr %a) { ; Ensure we properly truncate off the high-order 24 bits -; CHECK: ldu.global.u8 +; CHECK: ldu.global.b8 ; CHECK: cvt.u32.u16 ; CHECK: and.b32 %r{{[0-9]+}}, %r{{[0-9]+}}, 255 %val = tail call i8 @llvm.nvvm.ldu.global.i.i8.p0(ptr %a, i32 4) diff --git a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll index 2c1550aa082f..be2e896f5700 100644 --- a/llvm/test/CodeGen/NVPTX/ldu-ldg.ll +++ b/llvm/test/CodeGen/NVPTX/ldu-ldg.ll @@ -31,8 +31,8 @@ define i8 
@test_ldu_i8(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_i8_param_0]; -; CHECK-NEXT: ldu.global.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldu_i8_param_0]; +; CHECK-NEXT: ldu.global.b8 %rs1, [%rd1]; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; ; CHECK-NEXT: and.b32 %r2, %r1, 255; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; @@ -49,8 +49,8 @@ define i16 @test_ldu_i16(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_i16_param_0]; -; CHECK-NEXT: ldu.global.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldu_i16_param_0]; +; CHECK-NEXT: ldu.global.b16 %rs1, [%rd1]; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -65,8 +65,8 @@ define i32 @test_ldu_i32(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_i32_param_0]; -; CHECK-NEXT: ldu.global.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldu_i32_param_0]; +; CHECK-NEXT: ldu.global.b32 %r1, [%rd1]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4) @@ -79,8 +79,8 @@ define i64 @test_ldu_i64(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_i64_param_0]; -; CHECK-NEXT: ldu.global.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldu_i64_param_0]; +; CHECK-NEXT: ldu.global.b64 %rd2, [%rd1]; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; ; CHECK-NEXT: ret; %val = tail call i64 @llvm.nvvm.ldu.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8) @@ -93,8 +93,8 @@ define ptr @test_ldu_p(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: 
-; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_p_param_0]; -; CHECK-NEXT: ldu.global.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldu_p_param_0]; +; CHECK-NEXT: ldu.global.b64 %rd2, [%rd1]; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; ; CHECK-NEXT: ret; %val = tail call ptr @llvm.nvvm.ldu.global.p.p1(ptr addrspace(1) %ptr, i32 8) @@ -108,9 +108,9 @@ define float @test_ldu_f32(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_f32_param_0]; -; CHECK-NEXT: ldu.global.f32 %f1, [%rd1]; -; CHECK-NEXT: st.param.f32 [func_retval0], %f1; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldu_f32_param_0]; +; CHECK-NEXT: ldu.global.b32 %f1, [%rd1]; +; CHECK-NEXT: st.param.b32 [func_retval0], %f1; ; CHECK-NEXT: ret; %val = tail call float @llvm.nvvm.ldu.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4) ret float %val @@ -123,9 +123,9 @@ define double @test_ldu_f64(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %fd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_f64_param_0]; -; CHECK-NEXT: ldu.global.f64 %fd1, [%rd1]; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd1; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldu_f64_param_0]; +; CHECK-NEXT: ldu.global.b64 %fd1, [%rd1]; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd1; ; CHECK-NEXT: ret; %val = tail call double @llvm.nvvm.ldu.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8) ret double %val @@ -138,8 +138,8 @@ define half @test_ldu_f16(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_f16_param_0]; -; CHECK-NEXT: ldu.global.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldu_f16_param_0]; +; CHECK-NEXT: ldu.global.b16 %rs1, [%rd1]; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; %val = tail call half @llvm.nvvm.ldu.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2) @@ -153,8 +153,8 @@ define 
<2 x half> @test_ldu_v2f16(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldu_v2f16_param_0]; -; CHECK-NEXT: ldu.global.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldu_v2f16_param_0]; +; CHECK-NEXT: ldu.global.b32 %r1, [%rd1]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %val = tail call <2 x half> @llvm.nvvm.ldu.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4) @@ -169,8 +169,8 @@ define i8 @test_ldg_i8(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_i8_param_0]; -; CHECK-NEXT: ld.global.nc.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i8_param_0]; +; CHECK-NEXT: ld.global.nc.b8 %rs1, [%rd1]; ; CHECK-NEXT: cvt.u32.u8 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -186,8 +186,8 @@ define i16 @test_ldg_i16(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_i16_param_0]; -; CHECK-NEXT: ld.global.nc.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i16_param_0]; +; CHECK-NEXT: ld.global.nc.b16 %rs1, [%rd1]; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -202,8 +202,8 @@ define i32 @test_ldg_i32(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_i32_param_0]; -; CHECK-NEXT: ld.global.nc.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i32_param_0]; +; CHECK-NEXT: ld.global.nc.b32 %r1, [%rd1]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) %ptr, i32 4) @@ -216,8 +216,8 @@ define i64 @test_ldg_i64(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: 
; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_i64_param_0]; -; CHECK-NEXT: ld.global.nc.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_i64_param_0]; +; CHECK-NEXT: ld.global.nc.b64 %rd2, [%rd1]; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; ; CHECK-NEXT: ret; %val = tail call i64 @llvm.nvvm.ldg.global.i.i64.p1(ptr addrspace(1) %ptr, i32 8) @@ -230,8 +230,8 @@ define ptr @test_ldg_p(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_p_param_0]; -; CHECK-NEXT: ld.global.nc.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_p_param_0]; +; CHECK-NEXT: ld.global.nc.b64 %rd2, [%rd1]; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd2; ; CHECK-NEXT: ret; %val = tail call ptr @llvm.nvvm.ldg.global.p.p1(ptr addrspace(1) %ptr, i32 8) @@ -245,9 +245,9 @@ define float @test_ldg_f32(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_f32_param_0]; -; CHECK-NEXT: ld.global.nc.f32 %f1, [%rd1]; -; CHECK-NEXT: st.param.f32 [func_retval0], %f1; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_f32_param_0]; +; CHECK-NEXT: ld.global.nc.b32 %f1, [%rd1]; +; CHECK-NEXT: st.param.b32 [func_retval0], %f1; ; CHECK-NEXT: ret; %val = tail call float @llvm.nvvm.ldg.global.f.f32.p1(ptr addrspace(1) %ptr, i32 4) ret float %val @@ -260,9 +260,9 @@ define double @test_ldg_f64(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %fd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_f64_param_0]; -; CHECK-NEXT: ld.global.nc.f64 %fd1, [%rd1]; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd1; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_f64_param_0]; +; CHECK-NEXT: ld.global.nc.b64 %fd1, [%rd1]; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd1; ; CHECK-NEXT: ret; %val = tail call double @llvm.nvvm.ldg.global.f.f64.p1(ptr addrspace(1) %ptr, i32 8) ret 
double %val @@ -275,8 +275,8 @@ define half @test_ldg_f16(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_f16_param_0]; -; CHECK-NEXT: ld.global.nc.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_f16_param_0]; +; CHECK-NEXT: ld.global.nc.b16 %rs1, [%rd1]; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs1; ; CHECK-NEXT: ret; %val = tail call half @llvm.nvvm.ldg.global.f.f16.p1(ptr addrspace(1) %ptr, i32 2) @@ -290,8 +290,8 @@ define <2 x half> @test_ldg_v2f16(ptr addrspace(1) %ptr) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_ldg_v2f16_param_0]; -; CHECK-NEXT: ld.global.nc.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_ldg_v2f16_param_0]; +; CHECK-NEXT: ld.global.nc.b32 %r1, [%rd1]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %val = tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p1(ptr addrspace(1) %ptr, i32 4) @@ -306,7 +306,7 @@ define i32 @test_ldg_asi() { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.global.nc.u32 %r1, [g+4]; +; CHECK-NEXT: ld.global.nc.b32 %r1, [g+4]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %val = tail call i32 @llvm.nvvm.ldg.global.i.i32.p1(ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g, i32 4), i32 4) @@ -319,7 +319,7 @@ define i32 @test_lug_asi() { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ldu.global.u32 %r1, [g+4]; +; CHECK-NEXT: ldu.global.b32 %r1, [g+4]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; %val = tail call i32 @llvm.nvvm.ldu.global.i.i32.p1(ptr addrspace(1) getelementptr (i8, ptr addrspace(1) @g, i32 4), i32 4) diff --git a/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll b/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll index 8b740117c55e..f1f6be9750fb 100644 --- 
a/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll +++ b/llvm/test/CodeGen/NVPTX/ldu-reg-plus-offset.ll @@ -5,8 +5,8 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 define void @reg_plus_offset(ptr %a) { -; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}+32]; -; CHECK: ldu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}+36]; +; CHECK: ldu.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}+32]; +; CHECK: ldu.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}+36]; %p2 = getelementptr i32, ptr %a, i32 8 %t1 = call i32 @llvm.nvvm.ldu.global.i.i32.p0(ptr %p2, i32 4) %p3 = getelementptr i32, ptr %a, i32 9 diff --git a/llvm/test/CodeGen/NVPTX/load-sext-i1.ll b/llvm/test/CodeGen/NVPTX/load-sext-i1.ll index fd1492414bf8..5952097f4cd2 100644 --- a/llvm/test/CodeGen/NVPTX/load-sext-i1.ll +++ b/llvm/test/CodeGen/NVPTX/load-sext-i1.ll @@ -5,8 +5,8 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "nvptx-nvidia-cuda" define void @main(ptr %a1, i32 %a2, ptr %arg3) { -; CHECK: ld.u8 -; CHECK-NOT: ld.u1 +; CHECK: ld.b8 +; CHECK-NOT: ld.b1 %t1 = getelementptr i1, ptr %a1, i32 %a2 %t2 = load i1, ptr %t1 %t3 = sext i1 %t2 to i32 diff --git a/llvm/test/CodeGen/NVPTX/load-store-scalars.ll b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll index cb2e247bd78c..468e19492bfd 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-scalars.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll @@ -24,10 +24,10 @@ define void @generic_i8(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_i8_param_0]; -; CHECK-NEXT: ld.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_i8_param_0]; +; CHECK-NEXT: ld.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.u8 [%rd1], %rs2; +; CHECK-NEXT: st.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load i8, ptr %a %a.add = add i8 %a.load, 1 @@ -42,10 +42,10 @@ define void @generic_i16(ptr %a) { ; CHECK-NEXT: .reg .b64 
%rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_i16_param_0]; -; CHECK-NEXT: ld.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_i16_param_0]; +; CHECK-NEXT: ld.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.u16 [%rd1], %rs2; +; CHECK-NEXT: st.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load i16, ptr %a %a.add = add i16 %a.load, 1 @@ -60,10 +60,10 @@ define void @generic_i32(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_i32_param_0]; -; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_i32_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.u32 [%rd1], %r2; +; CHECK-NEXT: st.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load i32, ptr %a %a.add = add i32 %a.load, 1 @@ -77,10 +77,10 @@ define void @generic_i64(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_i64_param_0]; -; CHECK-NEXT: ld.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_i64_param_0]; +; CHECK-NEXT: ld.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, %rd2, 1; -; CHECK-NEXT: st.u64 [%rd1], %rd3; +; CHECK-NEXT: st.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load i64, ptr %a %a.add = add i64 %a.load, 1 @@ -95,10 +95,10 @@ define void @generic_float(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_float_param_0]; -; CHECK-NEXT: ld.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_float_param_0]; +; CHECK-NEXT: ld.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.f32 [%rd1], %f2; +; CHECK-NEXT: st.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load float, ptr %a %a.add = fadd float %a.load, 1. 
@@ -113,10 +113,10 @@ define void @generic_double(ptr %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_double_param_0]; -; CHECK-NEXT: ld.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_double_param_0]; +; CHECK-NEXT: ld.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.f64 [%rd1], %fd2; +; CHECK-NEXT: st.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load double, ptr %a %a.add = fadd double %a.load, 1. @@ -133,10 +133,10 @@ define void @generic_volatile_i8(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i8_param_0]; -; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_i8_param_0]; +; CHECK-NEXT: ld.volatile.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2; +; CHECK-NEXT: st.volatile.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load volatile i8, ptr %a %a.add = add i8 %a.load, 1 @@ -151,10 +151,10 @@ define void @generic_volatile_i16(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i16_param_0]; -; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_i16_param_0]; +; CHECK-NEXT: ld.volatile.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2; +; CHECK-NEXT: st.volatile.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load volatile i16, ptr %a %a.add = add i16 %a.load, 1 @@ -169,10 +169,10 @@ define void @generic_volatile_i32(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i32_param_0]; -; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_i32_param_0]; +; CHECK-NEXT: 
ld.volatile.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.volatile.u32 [%rd1], %r2; +; CHECK-NEXT: st.volatile.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load volatile i32, ptr %a %a.add = add i32 %a.load, 1 @@ -186,10 +186,10 @@ define void @generic_volatile_i64(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_i64_param_0]; -; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_i64_param_0]; +; CHECK-NEXT: ld.volatile.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, %rd2, 1; -; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3; +; CHECK-NEXT: st.volatile.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load volatile i64, ptr %a %a.add = add i64 %a.load, 1 @@ -204,10 +204,10 @@ define void @generic_volatile_float(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_float_param_0]; -; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_float_param_0]; +; CHECK-NEXT: ld.volatile.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.f32 [%rd1], %f2; +; CHECK-NEXT: st.volatile.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load volatile float, ptr %a %a.add = fadd float %a.load, 1. @@ -222,10 +222,10 @@ define void @generic_volatile_double(ptr %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_double_param_0]; -; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_double_param_0]; +; CHECK-NEXT: ld.volatile.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2; +; CHECK-NEXT: st.volatile.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load volatile double, ptr %a %a.add = fadd double %a.load, 1. 
@@ -242,10 +242,10 @@ define void @generic_unordered_sys_i8(ptr %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i8_param_0]; -; SM60-NEXT: ld.volatile.u8 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [generic_unordered_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.b8 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.u8 [%rd1], %rs2; +; SM60-NEXT: st.volatile.b8 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: generic_unordered_sys_i8( @@ -254,10 +254,10 @@ define void @generic_unordered_sys_i8(ptr %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i8_param_0]; -; SM70-NEXT: ld.relaxed.sys.u8 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [generic_unordered_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.b8 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.relaxed.sys.u8 [%rd1], %rs2; +; SM70-NEXT: st.relaxed.sys.b8 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic i8, ptr %a unordered, align 1 %a.add = add i8 %a.load, 1 @@ -272,10 +272,10 @@ define void @generic_unordered_sys_i16(ptr %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i16_param_0]; -; SM60-NEXT: ld.volatile.u16 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [generic_unordered_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.b16 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.u16 [%rd1], %rs2; +; SM60-NEXT: st.volatile.b16 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: generic_unordered_sys_i16( @@ -284,10 +284,10 @@ define void @generic_unordered_sys_i16(ptr %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i16_param_0]; -; SM70-NEXT: ld.relaxed.sys.u16 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, 
[generic_unordered_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.b16 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.relaxed.sys.u16 [%rd1], %rs2; +; SM70-NEXT: st.relaxed.sys.b16 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic i16, ptr %a unordered, align 2 %a.add = add i16 %a.load, 1 @@ -302,10 +302,10 @@ define void @generic_unordered_sys_i32(ptr %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i32_param_0]; -; SM60-NEXT: ld.volatile.u32 %r1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [generic_unordered_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.b32 %r1, [%rd1]; ; SM60-NEXT: add.s32 %r2, %r1, 1; -; SM60-NEXT: st.volatile.u32 [%rd1], %r2; +; SM60-NEXT: st.volatile.b32 [%rd1], %r2; ; SM60-NEXT: ret; ; ; SM70-LABEL: generic_unordered_sys_i32( @@ -314,10 +314,10 @@ define void @generic_unordered_sys_i32(ptr %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i32_param_0]; -; SM70-NEXT: ld.relaxed.sys.u32 %r1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [generic_unordered_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.b32 %r1, [%rd1]; ; SM70-NEXT: add.s32 %r2, %r1, 1; -; SM70-NEXT: st.relaxed.sys.u32 [%rd1], %r2; +; SM70-NEXT: st.relaxed.sys.b32 [%rd1], %r2; ; SM70-NEXT: ret; %a.load = load atomic i32, ptr %a unordered, align 4 %a.add = add i32 %a.load, 1 @@ -331,10 +331,10 @@ define void @generic_unordered_sys_i64(ptr %a) { ; SM60-NEXT: .reg .b64 %rd<4>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i64_param_0]; -; SM60-NEXT: ld.volatile.u64 %rd2, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [generic_unordered_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.b64 %rd2, [%rd1]; ; SM60-NEXT: add.s64 %rd3, %rd2, 1; -; SM60-NEXT: st.volatile.u64 [%rd1], %rd3; +; SM60-NEXT: st.volatile.b64 [%rd1], %rd3; ; SM60-NEXT: ret; ; ; SM70-LABEL: 
generic_unordered_sys_i64( @@ -342,10 +342,10 @@ define void @generic_unordered_sys_i64(ptr %a) { ; SM70-NEXT: .reg .b64 %rd<4>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_i64_param_0]; -; SM70-NEXT: ld.relaxed.sys.u64 %rd2, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [generic_unordered_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.b64 %rd2, [%rd1]; ; SM70-NEXT: add.s64 %rd3, %rd2, 1; -; SM70-NEXT: st.relaxed.sys.u64 [%rd1], %rd3; +; SM70-NEXT: st.relaxed.sys.b64 [%rd1], %rd3; ; SM70-NEXT: ret; %a.load = load atomic i64, ptr %a unordered, align 8 %a.add = add i64 %a.load, 1 @@ -360,10 +360,10 @@ define void @generic_unordered_sys_float(ptr %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_float_param_0]; -; SM60-NEXT: ld.volatile.f32 %f1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [generic_unordered_sys_float_param_0]; +; SM60-NEXT: ld.volatile.b32 %f1, [%rd1]; ; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM60-NEXT: st.volatile.f32 [%rd1], %f2; +; SM60-NEXT: st.volatile.b32 [%rd1], %f2; ; SM60-NEXT: ret; ; ; SM70-LABEL: generic_unordered_sys_float( @@ -372,10 +372,10 @@ define void @generic_unordered_sys_float(ptr %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_float_param_0]; -; SM70-NEXT: ld.relaxed.sys.f32 %f1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [generic_unordered_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.b32 %f1, [%rd1]; ; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM70-NEXT: st.relaxed.sys.f32 [%rd1], %f2; +; SM70-NEXT: st.relaxed.sys.b32 [%rd1], %f2; ; SM70-NEXT: ret; %a.load = load atomic float, ptr %a unordered, align 4 %a.add = fadd float %a.load, 1. 
@@ -390,10 +390,10 @@ define void @generic_unordered_sys_double(ptr %a) { ; SM60-NEXT: .reg .b64 %fd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_double_param_0]; -; SM60-NEXT: ld.volatile.f64 %fd1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [generic_unordered_sys_double_param_0]; +; SM60-NEXT: ld.volatile.b64 %fd1, [%rd1]; ; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM60-NEXT: st.volatile.f64 [%rd1], %fd2; +; SM60-NEXT: st.volatile.b64 [%rd1], %fd2; ; SM60-NEXT: ret; ; ; SM70-LABEL: generic_unordered_sys_double( @@ -402,10 +402,10 @@ define void @generic_unordered_sys_double(ptr %a) { ; SM70-NEXT: .reg .b64 %fd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [generic_unordered_sys_double_param_0]; -; SM70-NEXT: ld.relaxed.sys.f64 %fd1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [generic_unordered_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.b64 %fd1, [%rd1]; ; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM70-NEXT: st.relaxed.sys.f64 [%rd1], %fd2; +; SM70-NEXT: st.relaxed.sys.b64 [%rd1], %fd2; ; SM70-NEXT: ret; %a.load = load atomic double, ptr %a unordered, align 8 %a.add = fadd double %a.load, 1. 
@@ -422,10 +422,10 @@ define void @generic_unordered_volatile_sys_i8(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i8_param_0]; -; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_unordered_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.volatile.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2; +; CHECK-NEXT: st.volatile.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr %a unordered, align 1 %a.add = add i8 %a.load, 1 @@ -440,10 +440,10 @@ define void @generic_unordered_volatile_sys_i16(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i16_param_0]; -; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_unordered_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.volatile.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2; +; CHECK-NEXT: st.volatile.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i16, ptr %a unordered, align 2 %a.add = add i16 %a.load, 1 @@ -458,10 +458,10 @@ define void @generic_unordered_volatile_sys_i32(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i32_param_0]; -; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_unordered_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.volatile.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.volatile.u32 [%rd1], %r2; +; CHECK-NEXT: st.volatile.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i32, ptr %a unordered, align 4 %a.add = add i32 %a.load, 1 @@ -475,10 +475,10 @@ define void @generic_unordered_volatile_sys_i64(ptr %a) { ; CHECK-NEXT: .reg .b64 
%rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_i64_param_0]; -; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_unordered_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.volatile.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, %rd2, 1; -; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3; +; CHECK-NEXT: st.volatile.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load atomic volatile i64, ptr %a unordered, align 8 %a.add = add i64 %a.load, 1 @@ -493,10 +493,10 @@ define void @generic_unordered_volatile_sys_float(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_float_param_0]; -; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_unordered_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.volatile.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.f32 [%rd1], %f2; +; CHECK-NEXT: st.volatile.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load atomic volatile float, ptr %a unordered, align 4 %a.add = fadd float %a.load, 1. @@ -511,10 +511,10 @@ define void @generic_unordered_volatile_sys_double(ptr %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_unordered_volatile_sys_double_param_0]; -; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_unordered_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.volatile.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2; +; CHECK-NEXT: st.volatile.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load atomic volatile double, ptr %a unordered, align 8 %a.add = fadd double %a.load, 1. 
@@ -531,10 +531,10 @@ define void @generic_monotonic_sys_i8(ptr %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i8_param_0]; -; SM60-NEXT: ld.volatile.u8 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [generic_monotonic_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.b8 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.u8 [%rd1], %rs2; +; SM60-NEXT: st.volatile.b8 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: generic_monotonic_sys_i8( @@ -543,10 +543,10 @@ define void @generic_monotonic_sys_i8(ptr %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i8_param_0]; -; SM70-NEXT: ld.relaxed.sys.u8 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [generic_monotonic_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.b8 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.relaxed.sys.u8 [%rd1], %rs2; +; SM70-NEXT: st.relaxed.sys.b8 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic i8, ptr %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -561,10 +561,10 @@ define void @generic_monotonic_sys_i16(ptr %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i16_param_0]; -; SM60-NEXT: ld.volatile.u16 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [generic_monotonic_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.b16 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.u16 [%rd1], %rs2; +; SM60-NEXT: st.volatile.b16 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: generic_monotonic_sys_i16( @@ -573,10 +573,10 @@ define void @generic_monotonic_sys_i16(ptr %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i16_param_0]; -; SM70-NEXT: ld.relaxed.sys.u16 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, 
[generic_monotonic_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.b16 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.relaxed.sys.u16 [%rd1], %rs2; +; SM70-NEXT: st.relaxed.sys.b16 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic i16, ptr %a monotonic, align 2 %a.add = add i16 %a.load, 1 @@ -591,10 +591,10 @@ define void @generic_monotonic_sys_i32(ptr %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i32_param_0]; -; SM60-NEXT: ld.volatile.u32 %r1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [generic_monotonic_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.b32 %r1, [%rd1]; ; SM60-NEXT: add.s32 %r2, %r1, 1; -; SM60-NEXT: st.volatile.u32 [%rd1], %r2; +; SM60-NEXT: st.volatile.b32 [%rd1], %r2; ; SM60-NEXT: ret; ; ; SM70-LABEL: generic_monotonic_sys_i32( @@ -603,10 +603,10 @@ define void @generic_monotonic_sys_i32(ptr %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i32_param_0]; -; SM70-NEXT: ld.relaxed.sys.u32 %r1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [generic_monotonic_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.b32 %r1, [%rd1]; ; SM70-NEXT: add.s32 %r2, %r1, 1; -; SM70-NEXT: st.relaxed.sys.u32 [%rd1], %r2; +; SM70-NEXT: st.relaxed.sys.b32 [%rd1], %r2; ; SM70-NEXT: ret; %a.load = load atomic i32, ptr %a monotonic, align 4 %a.add = add i32 %a.load, 1 @@ -620,10 +620,10 @@ define void @generic_monotonic_sys_i64(ptr %a) { ; SM60-NEXT: .reg .b64 %rd<4>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i64_param_0]; -; SM60-NEXT: ld.volatile.u64 %rd2, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [generic_monotonic_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.b64 %rd2, [%rd1]; ; SM60-NEXT: add.s64 %rd3, %rd2, 1; -; SM60-NEXT: st.volatile.u64 [%rd1], %rd3; +; SM60-NEXT: st.volatile.b64 [%rd1], %rd3; ; SM60-NEXT: ret; ; ; SM70-LABEL: 
generic_monotonic_sys_i64( @@ -631,10 +631,10 @@ define void @generic_monotonic_sys_i64(ptr %a) { ; SM70-NEXT: .reg .b64 %rd<4>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_i64_param_0]; -; SM70-NEXT: ld.relaxed.sys.u64 %rd2, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [generic_monotonic_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.b64 %rd2, [%rd1]; ; SM70-NEXT: add.s64 %rd3, %rd2, 1; -; SM70-NEXT: st.relaxed.sys.u64 [%rd1], %rd3; +; SM70-NEXT: st.relaxed.sys.b64 [%rd1], %rd3; ; SM70-NEXT: ret; %a.load = load atomic i64, ptr %a monotonic, align 8 %a.add = add i64 %a.load, 1 @@ -649,10 +649,10 @@ define void @generic_monotonic_sys_float(ptr %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_float_param_0]; -; SM60-NEXT: ld.volatile.f32 %f1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [generic_monotonic_sys_float_param_0]; +; SM60-NEXT: ld.volatile.b32 %f1, [%rd1]; ; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM60-NEXT: st.volatile.f32 [%rd1], %f2; +; SM60-NEXT: st.volatile.b32 [%rd1], %f2; ; SM60-NEXT: ret; ; ; SM70-LABEL: generic_monotonic_sys_float( @@ -661,10 +661,10 @@ define void @generic_monotonic_sys_float(ptr %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_float_param_0]; -; SM70-NEXT: ld.relaxed.sys.f32 %f1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [generic_monotonic_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.b32 %f1, [%rd1]; ; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM70-NEXT: st.relaxed.sys.f32 [%rd1], %f2; +; SM70-NEXT: st.relaxed.sys.b32 [%rd1], %f2; ; SM70-NEXT: ret; %a.load = load atomic float, ptr %a monotonic, align 4 %a.add = fadd float %a.load, 1. 
@@ -679,10 +679,10 @@ define void @generic_monotonic_sys_double(ptr %a) { ; SM60-NEXT: .reg .b64 %fd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_double_param_0]; -; SM60-NEXT: ld.volatile.f64 %fd1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [generic_monotonic_sys_double_param_0]; +; SM60-NEXT: ld.volatile.b64 %fd1, [%rd1]; ; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM60-NEXT: st.volatile.f64 [%rd1], %fd2; +; SM60-NEXT: st.volatile.b64 [%rd1], %fd2; ; SM60-NEXT: ret; ; ; SM70-LABEL: generic_monotonic_sys_double( @@ -691,10 +691,10 @@ define void @generic_monotonic_sys_double(ptr %a) { ; SM70-NEXT: .reg .b64 %fd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [generic_monotonic_sys_double_param_0]; -; SM70-NEXT: ld.relaxed.sys.f64 %fd1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [generic_monotonic_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.b64 %fd1, [%rd1]; ; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM70-NEXT: st.relaxed.sys.f64 [%rd1], %fd2; +; SM70-NEXT: st.relaxed.sys.b64 [%rd1], %fd2; ; SM70-NEXT: ret; %a.load = load atomic double, ptr %a monotonic, align 8 %a.add = fadd double %a.load, 1. 
@@ -711,10 +711,10 @@ define void @generic_monotonic_volatile_sys_i8(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i8_param_0]; -; CHECK-NEXT: ld.volatile.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_monotonic_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.volatile.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.volatile.u8 [%rd1], %rs2; +; CHECK-NEXT: st.volatile.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -729,10 +729,10 @@ define void @generic_monotonic_volatile_sys_i16(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i16_param_0]; -; CHECK-NEXT: ld.volatile.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_monotonic_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.volatile.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.volatile.u16 [%rd1], %rs2; +; CHECK-NEXT: st.volatile.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i16, ptr %a monotonic, align 2 %a.add = add i16 %a.load, 1 @@ -747,10 +747,10 @@ define void @generic_monotonic_volatile_sys_i32(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i32_param_0]; -; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_monotonic_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.volatile.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.volatile.u32 [%rd1], %r2; +; CHECK-NEXT: st.volatile.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i32, ptr %a monotonic, align 4 %a.add = add i32 %a.load, 1 @@ -764,10 +764,10 @@ define void @generic_monotonic_volatile_sys_i64(ptr %a) { ; CHECK-NEXT: .reg .b64 
%rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_i64_param_0]; -; CHECK-NEXT: ld.volatile.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_monotonic_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.volatile.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, %rd2, 1; -; CHECK-NEXT: st.volatile.u64 [%rd1], %rd3; +; CHECK-NEXT: st.volatile.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load atomic volatile i64, ptr %a monotonic, align 8 %a.add = add i64 %a.load, 1 @@ -782,10 +782,10 @@ define void @generic_monotonic_volatile_sys_float(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_float_param_0]; -; CHECK-NEXT: ld.volatile.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_monotonic_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.volatile.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.f32 [%rd1], %f2; +; CHECK-NEXT: st.volatile.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load atomic volatile float, ptr %a monotonic, align 4 %a.add = fadd float %a.load, 1. @@ -800,10 +800,10 @@ define void @generic_monotonic_volatile_sys_double(ptr %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_monotonic_volatile_sys_double_param_0]; -; CHECK-NEXT: ld.volatile.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_monotonic_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.volatile.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.volatile.f64 [%rd1], %fd2; +; CHECK-NEXT: st.volatile.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load atomic volatile double, ptr %a monotonic, align 8 %a.add = fadd double %a.load, 1. 
@@ -822,10 +822,10 @@ define void @global_i8(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_i8_param_0]; -; CHECK-NEXT: ld.global.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_i8_param_0]; +; CHECK-NEXT: ld.global.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.global.u8 [%rd1], %rs2; +; CHECK-NEXT: st.global.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load i8, ptr addrspace(1) %a %a.add = add i8 %a.load, 1 @@ -840,10 +840,10 @@ define void @global_i16(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_i16_param_0]; -; CHECK-NEXT: ld.global.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_i16_param_0]; +; CHECK-NEXT: ld.global.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.global.u16 [%rd1], %rs2; +; CHECK-NEXT: st.global.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load i16, ptr addrspace(1) %a %a.add = add i16 %a.load, 1 @@ -858,10 +858,10 @@ define void @global_i32(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_i32_param_0]; -; CHECK-NEXT: ld.global.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_i32_param_0]; +; CHECK-NEXT: ld.global.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.global.u32 [%rd1], %r2; +; CHECK-NEXT: st.global.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load i32, ptr addrspace(1) %a %a.add = add i32 %a.load, 1 @@ -875,10 +875,10 @@ define void @global_i64(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_i64_param_0]; -; CHECK-NEXT: ld.global.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_i64_param_0]; +; CHECK-NEXT: ld.global.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, 
%rd2, 1; -; CHECK-NEXT: st.global.u64 [%rd1], %rd3; +; CHECK-NEXT: st.global.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load i64, ptr addrspace(1) %a %a.add = add i64 %a.load, 1 @@ -893,10 +893,10 @@ define void @global_float(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_float_param_0]; -; CHECK-NEXT: ld.global.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_float_param_0]; +; CHECK-NEXT: ld.global.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.global.f32 [%rd1], %f2; +; CHECK-NEXT: st.global.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load float, ptr addrspace(1) %a %a.add = fadd float %a.load, 1. @@ -911,10 +911,10 @@ define void @global_double(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_double_param_0]; -; CHECK-NEXT: ld.global.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_double_param_0]; +; CHECK-NEXT: ld.global.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.global.f64 [%rd1], %fd2; +; CHECK-NEXT: st.global.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load double, ptr addrspace(1) %a %a.add = fadd double %a.load, 1. 
@@ -931,10 +931,10 @@ define void @global_volatile_i8(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i8_param_0]; -; CHECK-NEXT: ld.volatile.global.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_i8_param_0]; +; CHECK-NEXT: ld.volatile.global.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.volatile.global.u8 [%rd1], %rs2; +; CHECK-NEXT: st.volatile.global.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load volatile i8, ptr addrspace(1) %a %a.add = add i8 %a.load, 1 @@ -949,10 +949,10 @@ define void @global_volatile_i16(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i16_param_0]; -; CHECK-NEXT: ld.volatile.global.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_i16_param_0]; +; CHECK-NEXT: ld.volatile.global.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.volatile.global.u16 [%rd1], %rs2; +; CHECK-NEXT: st.volatile.global.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load volatile i16, ptr addrspace(1) %a %a.add = add i16 %a.load, 1 @@ -967,10 +967,10 @@ define void @global_volatile_i32(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i32_param_0]; -; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_i32_param_0]; +; CHECK-NEXT: ld.volatile.global.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r2; +; CHECK-NEXT: st.volatile.global.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load volatile i32, ptr addrspace(1) %a %a.add = add i32 %a.load, 1 @@ -984,10 +984,10 @@ define void @global_volatile_i64(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; 
CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_i64_param_0]; -; CHECK-NEXT: ld.volatile.global.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_i64_param_0]; +; CHECK-NEXT: ld.volatile.global.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, %rd2, 1; -; CHECK-NEXT: st.volatile.global.u64 [%rd1], %rd3; +; CHECK-NEXT: st.volatile.global.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load volatile i64, ptr addrspace(1) %a %a.add = add i64 %a.load, 1 @@ -1002,10 +1002,10 @@ define void @global_volatile_float(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_float_param_0]; -; CHECK-NEXT: ld.volatile.global.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_float_param_0]; +; CHECK-NEXT: ld.volatile.global.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.global.f32 [%rd1], %f2; +; CHECK-NEXT: st.volatile.global.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load volatile float, ptr addrspace(1) %a %a.add = fadd float %a.load, 1. @@ -1020,10 +1020,10 @@ define void @global_volatile_double(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_double_param_0]; -; CHECK-NEXT: ld.volatile.global.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_double_param_0]; +; CHECK-NEXT: ld.volatile.global.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.volatile.global.f64 [%rd1], %fd2; +; CHECK-NEXT: st.volatile.global.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load volatile double, ptr addrspace(1) %a %a.add = fadd double %a.load, 1. 
@@ -1040,10 +1040,10 @@ define void @global_unordered_sys_i8(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i8_param_0]; -; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_unordered_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.global.b8 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2; +; SM60-NEXT: st.volatile.global.b8 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_unordered_sys_i8( @@ -1052,10 +1052,10 @@ define void @global_unordered_sys_i8(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i8_param_0]; -; SM70-NEXT: ld.relaxed.sys.global.u8 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_unordered_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.b8 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.relaxed.sys.global.u8 [%rd1], %rs2; +; SM70-NEXT: st.relaxed.sys.global.b8 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 @@ -1070,10 +1070,10 @@ define void @global_unordered_sys_i16(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i16_param_0]; -; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_unordered_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.global.b16 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2; +; SM60-NEXT: st.volatile.global.b16 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_unordered_sys_i16( @@ -1082,10 +1082,10 @@ define void @global_unordered_sys_i16(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 
%rd1, [global_unordered_sys_i16_param_0]; -; SM70-NEXT: ld.relaxed.sys.global.u16 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_unordered_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.b16 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.relaxed.sys.global.u16 [%rd1], %rs2; +; SM70-NEXT: st.relaxed.sys.global.b16 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic i16, ptr addrspace(1) %a unordered, align 2 %a.add = add i16 %a.load, 1 @@ -1100,10 +1100,10 @@ define void @global_unordered_sys_i32(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i32_param_0]; -; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_unordered_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.global.b32 %r1, [%rd1]; ; SM60-NEXT: add.s32 %r2, %r1, 1; -; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2; +; SM60-NEXT: st.volatile.global.b32 [%rd1], %r2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_unordered_sys_i32( @@ -1112,10 +1112,10 @@ define void @global_unordered_sys_i32(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i32_param_0]; -; SM70-NEXT: ld.relaxed.sys.global.u32 %r1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_unordered_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.b32 %r1, [%rd1]; ; SM70-NEXT: add.s32 %r2, %r1, 1; -; SM70-NEXT: st.relaxed.sys.global.u32 [%rd1], %r2; +; SM70-NEXT: st.relaxed.sys.global.b32 [%rd1], %r2; ; SM70-NEXT: ret; %a.load = load atomic i32, ptr addrspace(1) %a unordered, align 4 %a.add = add i32 %a.load, 1 @@ -1129,10 +1129,10 @@ define void @global_unordered_sys_i64(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<4>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i64_param_0]; -; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1]; +; SM60-NEXT: 
ld.param.b64 %rd1, [global_unordered_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.global.b64 %rd2, [%rd1]; ; SM60-NEXT: add.s64 %rd3, %rd2, 1; -; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3; +; SM60-NEXT: st.volatile.global.b64 [%rd1], %rd3; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_unordered_sys_i64( @@ -1140,10 +1140,10 @@ define void @global_unordered_sys_i64(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<4>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_i64_param_0]; -; SM70-NEXT: ld.relaxed.sys.global.u64 %rd2, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_unordered_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.b64 %rd2, [%rd1]; ; SM70-NEXT: add.s64 %rd3, %rd2, 1; -; SM70-NEXT: st.relaxed.sys.global.u64 [%rd1], %rd3; +; SM70-NEXT: st.relaxed.sys.global.b64 [%rd1], %rd3; ; SM70-NEXT: ret; %a.load = load atomic i64, ptr addrspace(1) %a unordered, align 8 %a.add = add i64 %a.load, 1 @@ -1158,10 +1158,10 @@ define void @global_unordered_sys_float(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_float_param_0]; -; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_unordered_sys_float_param_0]; +; SM60-NEXT: ld.volatile.global.b32 %f1, [%rd1]; ; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2; +; SM60-NEXT: st.volatile.global.b32 [%rd1], %f2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_unordered_sys_float( @@ -1170,10 +1170,10 @@ define void @global_unordered_sys_float(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_float_param_0]; -; SM70-NEXT: ld.relaxed.sys.global.f32 %f1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_unordered_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.b32 %f1, [%rd1]; ; SM70-NEXT: add.rn.f32 %f2, 
%f1, 0f3F800000; -; SM70-NEXT: st.relaxed.sys.global.f32 [%rd1], %f2; +; SM70-NEXT: st.relaxed.sys.global.b32 [%rd1], %f2; ; SM70-NEXT: ret; %a.load = load atomic float, ptr addrspace(1) %a unordered, align 4 %a.add = fadd float %a.load, 1. @@ -1188,10 +1188,10 @@ define void @global_unordered_sys_double(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %fd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_sys_double_param_0]; -; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_unordered_sys_double_param_0]; +; SM60-NEXT: ld.volatile.global.b64 %fd1, [%rd1]; ; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2; +; SM60-NEXT: st.volatile.global.b64 [%rd1], %fd2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_unordered_sys_double( @@ -1200,10 +1200,10 @@ define void @global_unordered_sys_double(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %fd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_sys_double_param_0]; -; SM70-NEXT: ld.relaxed.sys.global.f64 %fd1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_unordered_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.b64 %fd1, [%rd1]; ; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM70-NEXT: st.relaxed.sys.global.f64 [%rd1], %fd2; +; SM70-NEXT: st.relaxed.sys.global.b64 [%rd1], %fd2; ; SM70-NEXT: ret; %a.load = load atomic double, ptr addrspace(1) %a unordered, align 8 %a.add = fadd double %a.load, 1. 
@@ -1220,10 +1220,10 @@ define void @global_unordered_volatile_sys_i8(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i8_param_0]; -; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_unordered_volatile_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.global.b8 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2; +; SM60-NEXT: st.volatile.global.b8 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_unordered_volatile_sys_i8( @@ -1232,10 +1232,10 @@ define void @global_unordered_volatile_sys_i8(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i8_param_0]; -; SM70-NEXT: ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_unordered_volatile_sys_i8_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.b8 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.mmio.relaxed.sys.global.u8 [%rd1], %rs2; +; SM70-NEXT: st.mmio.relaxed.sys.global.b8 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(1) %a unordered, align 1 %a.add = add i8 %a.load, 1 @@ -1250,10 +1250,10 @@ define void @global_unordered_volatile_sys_i16(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i16_param_0]; -; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_unordered_volatile_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.global.b16 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2; +; SM60-NEXT: st.volatile.global.b16 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_unordered_volatile_sys_i16( @@ -1262,10 +1262,10 @@ define void 
@global_unordered_volatile_sys_i16(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i16_param_0]; -; SM70-NEXT: ld.mmio.relaxed.sys.global.u16 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_unordered_volatile_sys_i16_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.b16 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.mmio.relaxed.sys.global.u16 [%rd1], %rs2; +; SM70-NEXT: st.mmio.relaxed.sys.global.b16 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic volatile i16, ptr addrspace(1) %a unordered, align 2 %a.add = add i16 %a.load, 1 @@ -1280,10 +1280,10 @@ define void @global_unordered_volatile_sys_i32(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i32_param_0]; -; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_unordered_volatile_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.global.b32 %r1, [%rd1]; ; SM60-NEXT: add.s32 %r2, %r1, 1; -; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2; +; SM60-NEXT: st.volatile.global.b32 [%rd1], %r2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_unordered_volatile_sys_i32( @@ -1292,10 +1292,10 @@ define void @global_unordered_volatile_sys_i32(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i32_param_0]; -; SM70-NEXT: ld.mmio.relaxed.sys.global.u32 %r1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_unordered_volatile_sys_i32_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.b32 %r1, [%rd1]; ; SM70-NEXT: add.s32 %r2, %r1, 1; -; SM70-NEXT: st.mmio.relaxed.sys.global.u32 [%rd1], %r2; +; SM70-NEXT: st.mmio.relaxed.sys.global.b32 [%rd1], %r2; ; SM70-NEXT: ret; %a.load = load atomic volatile i32, ptr addrspace(1) %a unordered, align 4 %a.add = add i32 %a.load, 
1 @@ -1309,10 +1309,10 @@ define void @global_unordered_volatile_sys_i64(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<4>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i64_param_0]; -; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_unordered_volatile_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.global.b64 %rd2, [%rd1]; ; SM60-NEXT: add.s64 %rd3, %rd2, 1; -; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3; +; SM60-NEXT: st.volatile.global.b64 [%rd1], %rd3; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_unordered_volatile_sys_i64( @@ -1320,10 +1320,10 @@ define void @global_unordered_volatile_sys_i64(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<4>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_i64_param_0]; -; SM70-NEXT: ld.mmio.relaxed.sys.global.u64 %rd2, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_unordered_volatile_sys_i64_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.b64 %rd2, [%rd1]; ; SM70-NEXT: add.s64 %rd3, %rd2, 1; -; SM70-NEXT: st.mmio.relaxed.sys.global.u64 [%rd1], %rd3; +; SM70-NEXT: st.mmio.relaxed.sys.global.b64 [%rd1], %rd3; ; SM70-NEXT: ret; %a.load = load atomic volatile i64, ptr addrspace(1) %a unordered, align 8 %a.add = add i64 %a.load, 1 @@ -1338,10 +1338,10 @@ define void @global_unordered_volatile_sys_float(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_float_param_0]; -; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_unordered_volatile_sys_float_param_0]; +; SM60-NEXT: ld.volatile.global.b32 %f1, [%rd1]; ; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2; +; SM60-NEXT: st.volatile.global.b32 [%rd1], %f2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_unordered_volatile_sys_float( @@ -1350,10 
+1350,10 @@ define void @global_unordered_volatile_sys_float(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_float_param_0]; -; SM70-NEXT: ld.mmio.relaxed.sys.global.f32 %f1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_unordered_volatile_sys_float_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.b32 %f1, [%rd1]; ; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM70-NEXT: st.mmio.relaxed.sys.global.f32 [%rd1], %f2; +; SM70-NEXT: st.mmio.relaxed.sys.global.b32 [%rd1], %f2; ; SM70-NEXT: ret; %a.load = load atomic volatile float, ptr addrspace(1) %a unordered, align 4 %a.add = fadd float %a.load, 1. @@ -1368,10 +1368,10 @@ define void @global_unordered_volatile_sys_double(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %fd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_double_param_0]; -; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_unordered_volatile_sys_double_param_0]; +; SM60-NEXT: ld.volatile.global.b64 %fd1, [%rd1]; ; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2; +; SM60-NEXT: st.volatile.global.b64 [%rd1], %fd2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_unordered_volatile_sys_double( @@ -1380,10 +1380,10 @@ define void @global_unordered_volatile_sys_double(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %fd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_unordered_volatile_sys_double_param_0]; -; SM70-NEXT: ld.mmio.relaxed.sys.global.f64 %fd1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_unordered_volatile_sys_double_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.b64 %fd1, [%rd1]; ; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM70-NEXT: st.mmio.relaxed.sys.global.f64 [%rd1], %fd2; +; SM70-NEXT: st.mmio.relaxed.sys.global.b64 [%rd1], %fd2; ; 
SM70-NEXT: ret; %a.load = load atomic volatile double, ptr addrspace(1) %a unordered, align 8 %a.add = fadd double %a.load, 1. @@ -1400,10 +1400,10 @@ define void @global_monotonic_sys_i8(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i8_param_0]; -; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_monotonic_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.global.b8 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2; +; SM60-NEXT: st.volatile.global.b8 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_monotonic_sys_i8( @@ -1412,10 +1412,10 @@ define void @global_monotonic_sys_i8(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i8_param_0]; -; SM70-NEXT: ld.relaxed.sys.global.u8 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_monotonic_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.b8 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.relaxed.sys.global.u8 [%rd1], %rs2; +; SM70-NEXT: st.relaxed.sys.global.b8 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -1430,10 +1430,10 @@ define void @global_monotonic_sys_i16(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i16_param_0]; -; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_monotonic_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.global.b16 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2; +; SM60-NEXT: st.volatile.global.b16 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_monotonic_sys_i16( @@ -1442,10 +1442,10 @@ define void 
@global_monotonic_sys_i16(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i16_param_0]; -; SM70-NEXT: ld.relaxed.sys.global.u16 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_monotonic_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.b16 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.relaxed.sys.global.u16 [%rd1], %rs2; +; SM70-NEXT: st.relaxed.sys.global.b16 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic i16, ptr addrspace(1) %a monotonic, align 2 %a.add = add i16 %a.load, 1 @@ -1460,10 +1460,10 @@ define void @global_monotonic_sys_i32(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i32_param_0]; -; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_monotonic_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.global.b32 %r1, [%rd1]; ; SM60-NEXT: add.s32 %r2, %r1, 1; -; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2; +; SM60-NEXT: st.volatile.global.b32 [%rd1], %r2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_monotonic_sys_i32( @@ -1472,10 +1472,10 @@ define void @global_monotonic_sys_i32(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i32_param_0]; -; SM70-NEXT: ld.relaxed.sys.global.u32 %r1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_monotonic_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.b32 %r1, [%rd1]; ; SM70-NEXT: add.s32 %r2, %r1, 1; -; SM70-NEXT: st.relaxed.sys.global.u32 [%rd1], %r2; +; SM70-NEXT: st.relaxed.sys.global.b32 [%rd1], %r2; ; SM70-NEXT: ret; %a.load = load atomic i32, ptr addrspace(1) %a monotonic, align 4 %a.add = add i32 %a.load, 1 @@ -1489,10 +1489,10 @@ define void @global_monotonic_sys_i64(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<4>; ; SM60-EMPTY: ; SM60-NEXT: // 
%bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i64_param_0]; -; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_monotonic_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.global.b64 %rd2, [%rd1]; ; SM60-NEXT: add.s64 %rd3, %rd2, 1; -; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3; +; SM60-NEXT: st.volatile.global.b64 [%rd1], %rd3; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_monotonic_sys_i64( @@ -1500,10 +1500,10 @@ define void @global_monotonic_sys_i64(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<4>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_i64_param_0]; -; SM70-NEXT: ld.relaxed.sys.global.u64 %rd2, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_monotonic_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.b64 %rd2, [%rd1]; ; SM70-NEXT: add.s64 %rd3, %rd2, 1; -; SM70-NEXT: st.relaxed.sys.global.u64 [%rd1], %rd3; +; SM70-NEXT: st.relaxed.sys.global.b64 [%rd1], %rd3; ; SM70-NEXT: ret; %a.load = load atomic i64, ptr addrspace(1) %a monotonic, align 8 %a.add = add i64 %a.load, 1 @@ -1518,10 +1518,10 @@ define void @global_monotonic_sys_float(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_float_param_0]; -; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_monotonic_sys_float_param_0]; +; SM60-NEXT: ld.volatile.global.b32 %f1, [%rd1]; ; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2; +; SM60-NEXT: st.volatile.global.b32 [%rd1], %f2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_monotonic_sys_float( @@ -1530,10 +1530,10 @@ define void @global_monotonic_sys_float(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_float_param_0]; -; SM70-NEXT: ld.relaxed.sys.global.f32 %f1, [%rd1]; +; SM70-NEXT: 
ld.param.b64 %rd1, [global_monotonic_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.b32 %f1, [%rd1]; ; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM70-NEXT: st.relaxed.sys.global.f32 [%rd1], %f2; +; SM70-NEXT: st.relaxed.sys.global.b32 [%rd1], %f2; ; SM70-NEXT: ret; %a.load = load atomic float, ptr addrspace(1) %a monotonic, align 4 %a.add = fadd float %a.load, 1. @@ -1548,10 +1548,10 @@ define void @global_monotonic_sys_double(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %fd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_double_param_0]; -; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_monotonic_sys_double_param_0]; +; SM60-NEXT: ld.volatile.global.b64 %fd1, [%rd1]; ; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2; +; SM60-NEXT: st.volatile.global.b64 [%rd1], %fd2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_monotonic_sys_double( @@ -1560,10 +1560,10 @@ define void @global_monotonic_sys_double(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %fd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_sys_double_param_0]; -; SM70-NEXT: ld.relaxed.sys.global.f64 %fd1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_monotonic_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.global.b64 %fd1, [%rd1]; ; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM70-NEXT: st.relaxed.sys.global.f64 [%rd1], %fd2; +; SM70-NEXT: st.relaxed.sys.global.b64 [%rd1], %fd2; ; SM70-NEXT: ret; %a.load = load atomic double, ptr addrspace(1) %a monotonic, align 8 %a.add = fadd double %a.load, 1. 
@@ -1580,10 +1580,10 @@ define void @global_monotonic_volatile_sys_i8(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i8_param_0]; -; SM60-NEXT: ld.volatile.global.u8 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_monotonic_volatile_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.global.b8 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.global.u8 [%rd1], %rs2; +; SM60-NEXT: st.volatile.global.b8 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_monotonic_volatile_sys_i8( @@ -1592,10 +1592,10 @@ define void @global_monotonic_volatile_sys_i8(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i8_param_0]; -; SM70-NEXT: ld.mmio.relaxed.sys.global.u8 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_monotonic_volatile_sys_i8_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.b8 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.mmio.relaxed.sys.global.u8 [%rd1], %rs2; +; SM70-NEXT: st.mmio.relaxed.sys.global.b8 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(1) %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -1610,10 +1610,10 @@ define void @global_monotonic_volatile_sys_i16(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i16_param_0]; -; SM60-NEXT: ld.volatile.global.u16 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_monotonic_volatile_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.global.b16 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.global.u16 [%rd1], %rs2; +; SM60-NEXT: st.volatile.global.b16 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_monotonic_volatile_sys_i16( @@ -1622,10 +1622,10 @@ define void 
@global_monotonic_volatile_sys_i16(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i16_param_0]; -; SM70-NEXT: ld.mmio.relaxed.sys.global.u16 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_monotonic_volatile_sys_i16_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.b16 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.mmio.relaxed.sys.global.u16 [%rd1], %rs2; +; SM70-NEXT: st.mmio.relaxed.sys.global.b16 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic volatile i16, ptr addrspace(1) %a monotonic, align 2 %a.add = add i16 %a.load, 1 @@ -1640,10 +1640,10 @@ define void @global_monotonic_volatile_sys_i32(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i32_param_0]; -; SM60-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_monotonic_volatile_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.global.b32 %r1, [%rd1]; ; SM60-NEXT: add.s32 %r2, %r1, 1; -; SM60-NEXT: st.volatile.global.u32 [%rd1], %r2; +; SM60-NEXT: st.volatile.global.b32 [%rd1], %r2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_monotonic_volatile_sys_i32( @@ -1652,10 +1652,10 @@ define void @global_monotonic_volatile_sys_i32(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i32_param_0]; -; SM70-NEXT: ld.mmio.relaxed.sys.global.u32 %r1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_monotonic_volatile_sys_i32_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.b32 %r1, [%rd1]; ; SM70-NEXT: add.s32 %r2, %r1, 1; -; SM70-NEXT: st.mmio.relaxed.sys.global.u32 [%rd1], %r2; +; SM70-NEXT: st.mmio.relaxed.sys.global.b32 [%rd1], %r2; ; SM70-NEXT: ret; %a.load = load atomic volatile i32, ptr addrspace(1) %a monotonic, align 4 %a.add = add i32 %a.load, 
1 @@ -1669,10 +1669,10 @@ define void @global_monotonic_volatile_sys_i64(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<4>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i64_param_0]; -; SM60-NEXT: ld.volatile.global.u64 %rd2, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_monotonic_volatile_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.global.b64 %rd2, [%rd1]; ; SM60-NEXT: add.s64 %rd3, %rd2, 1; -; SM60-NEXT: st.volatile.global.u64 [%rd1], %rd3; +; SM60-NEXT: st.volatile.global.b64 [%rd1], %rd3; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_monotonic_volatile_sys_i64( @@ -1680,10 +1680,10 @@ define void @global_monotonic_volatile_sys_i64(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<4>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_i64_param_0]; -; SM70-NEXT: ld.mmio.relaxed.sys.global.u64 %rd2, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_monotonic_volatile_sys_i64_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.b64 %rd2, [%rd1]; ; SM70-NEXT: add.s64 %rd3, %rd2, 1; -; SM70-NEXT: st.mmio.relaxed.sys.global.u64 [%rd1], %rd3; +; SM70-NEXT: st.mmio.relaxed.sys.global.b64 [%rd1], %rd3; ; SM70-NEXT: ret; %a.load = load atomic volatile i64, ptr addrspace(1) %a monotonic, align 8 %a.add = add i64 %a.load, 1 @@ -1698,10 +1698,10 @@ define void @global_monotonic_volatile_sys_float(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_float_param_0]; -; SM60-NEXT: ld.volatile.global.f32 %f1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_monotonic_volatile_sys_float_param_0]; +; SM60-NEXT: ld.volatile.global.b32 %f1, [%rd1]; ; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM60-NEXT: st.volatile.global.f32 [%rd1], %f2; +; SM60-NEXT: st.volatile.global.b32 [%rd1], %f2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_monotonic_volatile_sys_float( @@ -1710,10 
+1710,10 @@ define void @global_monotonic_volatile_sys_float(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_float_param_0]; -; SM70-NEXT: ld.mmio.relaxed.sys.global.f32 %f1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_monotonic_volatile_sys_float_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.b32 %f1, [%rd1]; ; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM70-NEXT: st.mmio.relaxed.sys.global.f32 [%rd1], %f2; +; SM70-NEXT: st.mmio.relaxed.sys.global.b32 [%rd1], %f2; ; SM70-NEXT: ret; %a.load = load atomic volatile float, ptr addrspace(1) %a monotonic, align 4 %a.add = fadd float %a.load, 1. @@ -1728,10 +1728,10 @@ define void @global_monotonic_volatile_sys_double(ptr addrspace(1) %a) { ; SM60-NEXT: .reg .b64 %fd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_double_param_0]; -; SM60-NEXT: ld.volatile.global.f64 %fd1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [global_monotonic_volatile_sys_double_param_0]; +; SM60-NEXT: ld.volatile.global.b64 %fd1, [%rd1]; ; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM60-NEXT: st.volatile.global.f64 [%rd1], %fd2; +; SM60-NEXT: st.volatile.global.b64 [%rd1], %fd2; ; SM60-NEXT: ret; ; ; SM70-LABEL: global_monotonic_volatile_sys_double( @@ -1740,10 +1740,10 @@ define void @global_monotonic_volatile_sys_double(ptr addrspace(1) %a) { ; SM70-NEXT: .reg .b64 %fd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [global_monotonic_volatile_sys_double_param_0]; -; SM70-NEXT: ld.mmio.relaxed.sys.global.f64 %fd1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [global_monotonic_volatile_sys_double_param_0]; +; SM70-NEXT: ld.mmio.relaxed.sys.global.b64 %fd1, [%rd1]; ; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM70-NEXT: st.mmio.relaxed.sys.global.f64 [%rd1], %fd2; +; SM70-NEXT: st.mmio.relaxed.sys.global.b64 [%rd1], %fd2; ; 
SM70-NEXT: ret; %a.load = load atomic volatile double, ptr addrspace(1) %a monotonic, align 8 %a.add = fadd double %a.load, 1. @@ -1762,10 +1762,10 @@ define void @shared_i8(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_i8_param_0]; -; CHECK-NEXT: ld.shared.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_i8_param_0]; +; CHECK-NEXT: ld.shared.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.shared.u8 [%rd1], %rs2; +; CHECK-NEXT: st.shared.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load i8, ptr addrspace(3) %a %a.add = add i8 %a.load, 1 @@ -1780,10 +1780,10 @@ define void @shared_i16(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_i16_param_0]; -; CHECK-NEXT: ld.shared.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_i16_param_0]; +; CHECK-NEXT: ld.shared.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.shared.u16 [%rd1], %rs2; +; CHECK-NEXT: st.shared.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load i16, ptr addrspace(3) %a %a.add = add i16 %a.load, 1 @@ -1798,10 +1798,10 @@ define void @shared_i32(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_i32_param_0]; -; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_i32_param_0]; +; CHECK-NEXT: ld.shared.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.shared.u32 [%rd1], %r2; +; CHECK-NEXT: st.shared.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load i32, ptr addrspace(3) %a %a.add = add i32 %a.load, 1 @@ -1815,10 +1815,10 @@ define void @shared_i64(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_i64_param_0]; -; CHECK-NEXT: ld.shared.u64 %rd2, 
[%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_i64_param_0]; +; CHECK-NEXT: ld.shared.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, %rd2, 1; -; CHECK-NEXT: st.shared.u64 [%rd1], %rd3; +; CHECK-NEXT: st.shared.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load i64, ptr addrspace(3) %a %a.add = add i64 %a.load, 1 @@ -1833,10 +1833,10 @@ define void @shared_float(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_float_param_0]; -; CHECK-NEXT: ld.shared.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_float_param_0]; +; CHECK-NEXT: ld.shared.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.shared.f32 [%rd1], %f2; +; CHECK-NEXT: st.shared.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load float, ptr addrspace(3) %a %a.add = fadd float %a.load, 1. @@ -1851,10 +1851,10 @@ define void @shared_double(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_double_param_0]; -; CHECK-NEXT: ld.shared.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_double_param_0]; +; CHECK-NEXT: ld.shared.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.shared.f64 [%rd1], %fd2; +; CHECK-NEXT: st.shared.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load double, ptr addrspace(3) %a %a.add = fadd double %a.load, 1. 
@@ -1871,10 +1871,10 @@ define void @shared_volatile_i8(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i8_param_0]; -; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_i8_param_0]; +; CHECK-NEXT: ld.volatile.shared.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2; +; CHECK-NEXT: st.volatile.shared.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load volatile i8, ptr addrspace(3) %a %a.add = add i8 %a.load, 1 @@ -1889,10 +1889,10 @@ define void @shared_volatile_i16(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i16_param_0]; -; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_i16_param_0]; +; CHECK-NEXT: ld.volatile.shared.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2; +; CHECK-NEXT: st.volatile.shared.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load volatile i16, ptr addrspace(3) %a %a.add = add i16 %a.load, 1 @@ -1907,10 +1907,10 @@ define void @shared_volatile_i32(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i32_param_0]; -; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_i32_param_0]; +; CHECK-NEXT: ld.volatile.shared.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; CHECK-NEXT: st.volatile.shared.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load volatile i32, ptr addrspace(3) %a %a.add = add i32 %a.load, 1 @@ -1924,10 +1924,10 @@ define void @shared_volatile_i64(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // 
%bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_i64_param_0]; -; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_i64_param_0]; +; CHECK-NEXT: ld.volatile.shared.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, %rd2, 1; -; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3; +; CHECK-NEXT: st.volatile.shared.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load volatile i64, ptr addrspace(3) %a %a.add = add i64 %a.load, 1 @@ -1942,10 +1942,10 @@ define void @shared_volatile_float(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_float_param_0]; -; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_float_param_0]; +; CHECK-NEXT: ld.volatile.shared.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2; +; CHECK-NEXT: st.volatile.shared.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load volatile float, ptr addrspace(3) %a %a.add = fadd float %a.load, 1. @@ -1960,10 +1960,10 @@ define void @shared_volatile_double(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_double_param_0]; -; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_double_param_0]; +; CHECK-NEXT: ld.volatile.shared.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2; +; CHECK-NEXT: st.volatile.shared.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load volatile double, ptr addrspace(3) %a %a.add = fadd double %a.load, 1. 
@@ -1980,10 +1980,10 @@ define void @shared_unordered_sys_i8(ptr addrspace(3) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i8_param_0]; -; SM60-NEXT: ld.volatile.shared.u8 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [shared_unordered_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.shared.b8 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.shared.u8 [%rd1], %rs2; +; SM60-NEXT: st.volatile.shared.b8 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: shared_unordered_sys_i8( @@ -1992,10 +1992,10 @@ define void @shared_unordered_sys_i8(ptr addrspace(3) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i8_param_0]; -; SM70-NEXT: ld.relaxed.sys.shared.u8 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [shared_unordered_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.b8 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.relaxed.sys.shared.u8 [%rd1], %rs2; +; SM70-NEXT: st.relaxed.sys.shared.b8 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic i8, ptr addrspace(3) %a unordered, align 1 %a.add = add i8 %a.load, 1 @@ -2010,10 +2010,10 @@ define void @shared_unordered_sys_i16(ptr addrspace(3) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i16_param_0]; -; SM60-NEXT: ld.volatile.shared.u16 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [shared_unordered_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.shared.b16 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.shared.u16 [%rd1], %rs2; +; SM60-NEXT: st.volatile.shared.b16 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: shared_unordered_sys_i16( @@ -2022,10 +2022,10 @@ define void @shared_unordered_sys_i16(ptr addrspace(3) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 
%rd1, [shared_unordered_sys_i16_param_0]; -; SM70-NEXT: ld.relaxed.sys.shared.u16 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [shared_unordered_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.b16 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.relaxed.sys.shared.u16 [%rd1], %rs2; +; SM70-NEXT: st.relaxed.sys.shared.b16 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic i16, ptr addrspace(3) %a unordered, align 2 %a.add = add i16 %a.load, 1 @@ -2040,10 +2040,10 @@ define void @shared_unordered_sys_i32(ptr addrspace(3) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i32_param_0]; -; SM60-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [shared_unordered_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.shared.b32 %r1, [%rd1]; ; SM60-NEXT: add.s32 %r2, %r1, 1; -; SM60-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; SM60-NEXT: st.volatile.shared.b32 [%rd1], %r2; ; SM60-NEXT: ret; ; ; SM70-LABEL: shared_unordered_sys_i32( @@ -2052,10 +2052,10 @@ define void @shared_unordered_sys_i32(ptr addrspace(3) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i32_param_0]; -; SM70-NEXT: ld.relaxed.sys.shared.u32 %r1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [shared_unordered_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.b32 %r1, [%rd1]; ; SM70-NEXT: add.s32 %r2, %r1, 1; -; SM70-NEXT: st.relaxed.sys.shared.u32 [%rd1], %r2; +; SM70-NEXT: st.relaxed.sys.shared.b32 [%rd1], %r2; ; SM70-NEXT: ret; %a.load = load atomic i32, ptr addrspace(3) %a unordered, align 4 %a.add = add i32 %a.load, 1 @@ -2069,10 +2069,10 @@ define void @shared_unordered_sys_i64(ptr addrspace(3) %a) { ; SM60-NEXT: .reg .b64 %rd<4>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i64_param_0]; -; SM60-NEXT: ld.volatile.shared.u64 %rd2, [%rd1]; +; SM60-NEXT: 
ld.param.b64 %rd1, [shared_unordered_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.shared.b64 %rd2, [%rd1]; ; SM60-NEXT: add.s64 %rd3, %rd2, 1; -; SM60-NEXT: st.volatile.shared.u64 [%rd1], %rd3; +; SM60-NEXT: st.volatile.shared.b64 [%rd1], %rd3; ; SM60-NEXT: ret; ; ; SM70-LABEL: shared_unordered_sys_i64( @@ -2080,10 +2080,10 @@ define void @shared_unordered_sys_i64(ptr addrspace(3) %a) { ; SM70-NEXT: .reg .b64 %rd<4>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_i64_param_0]; -; SM70-NEXT: ld.relaxed.sys.shared.u64 %rd2, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [shared_unordered_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.b64 %rd2, [%rd1]; ; SM70-NEXT: add.s64 %rd3, %rd2, 1; -; SM70-NEXT: st.relaxed.sys.shared.u64 [%rd1], %rd3; +; SM70-NEXT: st.relaxed.sys.shared.b64 [%rd1], %rd3; ; SM70-NEXT: ret; %a.load = load atomic i64, ptr addrspace(3) %a unordered, align 8 %a.add = add i64 %a.load, 1 @@ -2098,10 +2098,10 @@ define void @shared_unordered_sys_float(ptr addrspace(3) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_float_param_0]; -; SM60-NEXT: ld.volatile.shared.f32 %f1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [shared_unordered_sys_float_param_0]; +; SM60-NEXT: ld.volatile.shared.b32 %f1, [%rd1]; ; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM60-NEXT: st.volatile.shared.f32 [%rd1], %f2; +; SM60-NEXT: st.volatile.shared.b32 [%rd1], %f2; ; SM60-NEXT: ret; ; ; SM70-LABEL: shared_unordered_sys_float( @@ -2110,10 +2110,10 @@ define void @shared_unordered_sys_float(ptr addrspace(3) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_float_param_0]; -; SM70-NEXT: ld.relaxed.sys.shared.f32 %f1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [shared_unordered_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.b32 %f1, [%rd1]; ; SM70-NEXT: add.rn.f32 %f2, 
%f1, 0f3F800000; -; SM70-NEXT: st.relaxed.sys.shared.f32 [%rd1], %f2; +; SM70-NEXT: st.relaxed.sys.shared.b32 [%rd1], %f2; ; SM70-NEXT: ret; %a.load = load atomic float, ptr addrspace(3) %a unordered, align 4 %a.add = fadd float %a.load, 1. @@ -2128,10 +2128,10 @@ define void @shared_unordered_sys_double(ptr addrspace(3) %a) { ; SM60-NEXT: .reg .b64 %fd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_double_param_0]; -; SM60-NEXT: ld.volatile.shared.f64 %fd1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [shared_unordered_sys_double_param_0]; +; SM60-NEXT: ld.volatile.shared.b64 %fd1, [%rd1]; ; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM60-NEXT: st.volatile.shared.f64 [%rd1], %fd2; +; SM60-NEXT: st.volatile.shared.b64 [%rd1], %fd2; ; SM60-NEXT: ret; ; ; SM70-LABEL: shared_unordered_sys_double( @@ -2140,10 +2140,10 @@ define void @shared_unordered_sys_double(ptr addrspace(3) %a) { ; SM70-NEXT: .reg .b64 %fd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [shared_unordered_sys_double_param_0]; -; SM70-NEXT: ld.relaxed.sys.shared.f64 %fd1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [shared_unordered_sys_double_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.b64 %fd1, [%rd1]; ; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM70-NEXT: st.relaxed.sys.shared.f64 [%rd1], %fd2; +; SM70-NEXT: st.relaxed.sys.shared.b64 [%rd1], %fd2; ; SM70-NEXT: ret; %a.load = load atomic double, ptr addrspace(3) %a unordered, align 8 %a.add = fadd double %a.load, 1. 
@@ -2160,10 +2160,10 @@ define void @shared_unordered_volatile_sys_i8(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i8_param_0]; -; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_unordered_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.volatile.shared.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2; +; CHECK-NEXT: st.volatile.shared.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(3) %a unordered, align 1 %a.add = add i8 %a.load, 1 @@ -2178,10 +2178,10 @@ define void @shared_unordered_volatile_sys_i16(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i16_param_0]; -; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_unordered_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.volatile.shared.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2; +; CHECK-NEXT: st.volatile.shared.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i16, ptr addrspace(3) %a unordered, align 2 %a.add = add i16 %a.load, 1 @@ -2196,10 +2196,10 @@ define void @shared_unordered_volatile_sys_i32(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i32_param_0]; -; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_unordered_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.volatile.shared.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; CHECK-NEXT: st.volatile.shared.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i32, ptr 
addrspace(3) %a unordered, align 4 %a.add = add i32 %a.load, 1 @@ -2213,10 +2213,10 @@ define void @shared_unordered_volatile_sys_i64(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_i64_param_0]; -; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_unordered_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.volatile.shared.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, %rd2, 1; -; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3; +; CHECK-NEXT: st.volatile.shared.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load atomic volatile i64, ptr addrspace(3) %a unordered, align 8 %a.add = add i64 %a.load, 1 @@ -2231,10 +2231,10 @@ define void @shared_unordered_volatile_sys_float(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_float_param_0]; -; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_unordered_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.volatile.shared.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2; +; CHECK-NEXT: st.volatile.shared.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load atomic volatile float, ptr addrspace(3) %a unordered, align 4 %a.add = fadd float %a.load, 1. 
@@ -2249,10 +2249,10 @@ define void @shared_unordered_volatile_sys_double(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_unordered_volatile_sys_double_param_0]; -; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_unordered_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.volatile.shared.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2; +; CHECK-NEXT: st.volatile.shared.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load atomic volatile double, ptr addrspace(3) %a unordered, align 8 %a.add = fadd double %a.load, 1. @@ -2269,10 +2269,10 @@ define void @shared_monotonic_sys_i8(ptr addrspace(3) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i8_param_0]; -; SM60-NEXT: ld.volatile.shared.u8 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [shared_monotonic_sys_i8_param_0]; +; SM60-NEXT: ld.volatile.shared.b8 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.shared.u8 [%rd1], %rs2; +; SM60-NEXT: st.volatile.shared.b8 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: shared_monotonic_sys_i8( @@ -2281,10 +2281,10 @@ define void @shared_monotonic_sys_i8(ptr addrspace(3) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i8_param_0]; -; SM70-NEXT: ld.relaxed.sys.shared.u8 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [shared_monotonic_sys_i8_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.b8 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.relaxed.sys.shared.u8 [%rd1], %rs2; +; SM70-NEXT: st.relaxed.sys.shared.b8 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic i8, ptr addrspace(3) %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -2299,10 +2299,10 @@ define void 
@shared_monotonic_sys_i16(ptr addrspace(3) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i16_param_0]; -; SM60-NEXT: ld.volatile.shared.u16 %rs1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [shared_monotonic_sys_i16_param_0]; +; SM60-NEXT: ld.volatile.shared.b16 %rs1, [%rd1]; ; SM60-NEXT: add.s16 %rs2, %rs1, 1; -; SM60-NEXT: st.volatile.shared.u16 [%rd1], %rs2; +; SM60-NEXT: st.volatile.shared.b16 [%rd1], %rs2; ; SM60-NEXT: ret; ; ; SM70-LABEL: shared_monotonic_sys_i16( @@ -2311,10 +2311,10 @@ define void @shared_monotonic_sys_i16(ptr addrspace(3) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i16_param_0]; -; SM70-NEXT: ld.relaxed.sys.shared.u16 %rs1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [shared_monotonic_sys_i16_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.b16 %rs1, [%rd1]; ; SM70-NEXT: add.s16 %rs2, %rs1, 1; -; SM70-NEXT: st.relaxed.sys.shared.u16 [%rd1], %rs2; +; SM70-NEXT: st.relaxed.sys.shared.b16 [%rd1], %rs2; ; SM70-NEXT: ret; %a.load = load atomic i16, ptr addrspace(3) %a monotonic, align 2 %a.add = add i16 %a.load, 1 @@ -2329,10 +2329,10 @@ define void @shared_monotonic_sys_i32(ptr addrspace(3) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i32_param_0]; -; SM60-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [shared_monotonic_sys_i32_param_0]; +; SM60-NEXT: ld.volatile.shared.b32 %r1, [%rd1]; ; SM60-NEXT: add.s32 %r2, %r1, 1; -; SM60-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; SM60-NEXT: st.volatile.shared.b32 [%rd1], %r2; ; SM60-NEXT: ret; ; ; SM70-LABEL: shared_monotonic_sys_i32( @@ -2341,10 +2341,10 @@ define void @shared_monotonic_sys_i32(ptr addrspace(3) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, 
[shared_monotonic_sys_i32_param_0]; -; SM70-NEXT: ld.relaxed.sys.shared.u32 %r1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [shared_monotonic_sys_i32_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.b32 %r1, [%rd1]; ; SM70-NEXT: add.s32 %r2, %r1, 1; -; SM70-NEXT: st.relaxed.sys.shared.u32 [%rd1], %r2; +; SM70-NEXT: st.relaxed.sys.shared.b32 [%rd1], %r2; ; SM70-NEXT: ret; %a.load = load atomic i32, ptr addrspace(3) %a monotonic, align 4 %a.add = add i32 %a.load, 1 @@ -2358,10 +2358,10 @@ define void @shared_monotonic_sys_i64(ptr addrspace(3) %a) { ; SM60-NEXT: .reg .b64 %rd<4>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i64_param_0]; -; SM60-NEXT: ld.volatile.shared.u64 %rd2, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [shared_monotonic_sys_i64_param_0]; +; SM60-NEXT: ld.volatile.shared.b64 %rd2, [%rd1]; ; SM60-NEXT: add.s64 %rd3, %rd2, 1; -; SM60-NEXT: st.volatile.shared.u64 [%rd1], %rd3; +; SM60-NEXT: st.volatile.shared.b64 [%rd1], %rd3; ; SM60-NEXT: ret; ; ; SM70-LABEL: shared_monotonic_sys_i64( @@ -2369,10 +2369,10 @@ define void @shared_monotonic_sys_i64(ptr addrspace(3) %a) { ; SM70-NEXT: .reg .b64 %rd<4>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_i64_param_0]; -; SM70-NEXT: ld.relaxed.sys.shared.u64 %rd2, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [shared_monotonic_sys_i64_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.b64 %rd2, [%rd1]; ; SM70-NEXT: add.s64 %rd3, %rd2, 1; -; SM70-NEXT: st.relaxed.sys.shared.u64 [%rd1], %rd3; +; SM70-NEXT: st.relaxed.sys.shared.b64 [%rd1], %rd3; ; SM70-NEXT: ret; %a.load = load atomic i64, ptr addrspace(3) %a monotonic, align 8 %a.add = add i64 %a.load, 1 @@ -2387,10 +2387,10 @@ define void @shared_monotonic_sys_float(ptr addrspace(3) %a) { ; SM60-NEXT: .reg .b64 %rd<2>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_float_param_0]; -; SM60-NEXT: ld.volatile.shared.f32 %f1, [%rd1]; +; SM60-NEXT: 
ld.param.b64 %rd1, [shared_monotonic_sys_float_param_0]; +; SM60-NEXT: ld.volatile.shared.b32 %f1, [%rd1]; ; SM60-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM60-NEXT: st.volatile.shared.f32 [%rd1], %f2; +; SM60-NEXT: st.volatile.shared.b32 [%rd1], %f2; ; SM60-NEXT: ret; ; ; SM70-LABEL: shared_monotonic_sys_float( @@ -2399,10 +2399,10 @@ define void @shared_monotonic_sys_float(ptr addrspace(3) %a) { ; SM70-NEXT: .reg .b64 %rd<2>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_float_param_0]; -; SM70-NEXT: ld.relaxed.sys.shared.f32 %f1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [shared_monotonic_sys_float_param_0]; +; SM70-NEXT: ld.relaxed.sys.shared.b32 %f1, [%rd1]; ; SM70-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; SM70-NEXT: st.relaxed.sys.shared.f32 [%rd1], %f2; +; SM70-NEXT: st.relaxed.sys.shared.b32 [%rd1], %f2; ; SM70-NEXT: ret; %a.load = load atomic float, ptr addrspace(3) %a monotonic, align 4 %a.add = fadd float %a.load, 1. @@ -2417,10 +2417,10 @@ define void @shared_monotonic_sys_double(ptr addrspace(3) %a) { ; SM60-NEXT: .reg .b64 %fd<3>; ; SM60-EMPTY: ; SM60-NEXT: // %bb.0: -; SM60-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_double_param_0]; -; SM60-NEXT: ld.volatile.shared.f64 %fd1, [%rd1]; +; SM60-NEXT: ld.param.b64 %rd1, [shared_monotonic_sys_double_param_0]; +; SM60-NEXT: ld.volatile.shared.b64 %fd1, [%rd1]; ; SM60-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM60-NEXT: st.volatile.shared.f64 [%rd1], %fd2; +; SM60-NEXT: st.volatile.shared.b64 [%rd1], %fd2; ; SM60-NEXT: ret; ; ; SM70-LABEL: shared_monotonic_sys_double( @@ -2429,10 +2429,10 @@ define void @shared_monotonic_sys_double(ptr addrspace(3) %a) { ; SM70-NEXT: .reg .b64 %fd<3>; ; SM70-EMPTY: ; SM70-NEXT: // %bb.0: -; SM70-NEXT: ld.param.u64 %rd1, [shared_monotonic_sys_double_param_0]; -; SM70-NEXT: ld.relaxed.sys.shared.f64 %fd1, [%rd1]; +; SM70-NEXT: ld.param.b64 %rd1, [shared_monotonic_sys_double_param_0]; +; SM70-NEXT: 
ld.relaxed.sys.shared.b64 %fd1, [%rd1]; ; SM70-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; SM70-NEXT: st.relaxed.sys.shared.f64 [%rd1], %fd2; +; SM70-NEXT: st.relaxed.sys.shared.b64 [%rd1], %fd2; ; SM70-NEXT: ret; %a.load = load atomic double, ptr addrspace(3) %a monotonic, align 8 %a.add = fadd double %a.load, 1. @@ -2449,10 +2449,10 @@ define void @shared_monotonic_volatile_sys_i8(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i8_param_0]; -; CHECK-NEXT: ld.volatile.shared.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_monotonic_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.volatile.shared.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.volatile.shared.u8 [%rd1], %rs2; +; CHECK-NEXT: st.volatile.shared.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(3) %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -2467,10 +2467,10 @@ define void @shared_monotonic_volatile_sys_i16(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i16_param_0]; -; CHECK-NEXT: ld.volatile.shared.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_monotonic_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.volatile.shared.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.volatile.shared.u16 [%rd1], %rs2; +; CHECK-NEXT: st.volatile.shared.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i16, ptr addrspace(3) %a monotonic, align 2 %a.add = add i16 %a.load, 1 @@ -2485,10 +2485,10 @@ define void @shared_monotonic_volatile_sys_i32(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i32_param_0]; -; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; 
CHECK-NEXT: ld.param.b64 %rd1, [shared_monotonic_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.volatile.shared.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; CHECK-NEXT: st.volatile.shared.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i32, ptr addrspace(3) %a monotonic, align 4 %a.add = add i32 %a.load, 1 @@ -2502,10 +2502,10 @@ define void @shared_monotonic_volatile_sys_i64(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_i64_param_0]; -; CHECK-NEXT: ld.volatile.shared.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_monotonic_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.volatile.shared.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, %rd2, 1; -; CHECK-NEXT: st.volatile.shared.u64 [%rd1], %rd3; +; CHECK-NEXT: st.volatile.shared.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load atomic volatile i64, ptr addrspace(3) %a monotonic, align 8 %a.add = add i64 %a.load, 1 @@ -2520,10 +2520,10 @@ define void @shared_monotonic_volatile_sys_float(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_float_param_0]; -; CHECK-NEXT: ld.volatile.shared.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_monotonic_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.volatile.shared.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.shared.f32 [%rd1], %f2; +; CHECK-NEXT: st.volatile.shared.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load atomic volatile float, ptr addrspace(3) %a monotonic, align 4 %a.add = fadd float %a.load, 1. 
@@ -2538,10 +2538,10 @@ define void @shared_monotonic_volatile_sys_double(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_monotonic_volatile_sys_double_param_0]; -; CHECK-NEXT: ld.volatile.shared.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_monotonic_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.volatile.shared.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.volatile.shared.f64 [%rd1], %fd2; +; CHECK-NEXT: st.volatile.shared.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load atomic volatile double, ptr addrspace(3) %a monotonic, align 8 %a.add = fadd double %a.load, 1. @@ -2560,10 +2560,10 @@ define void @local_i8(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_i8_param_0]; -; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_i8_param_0]; +; CHECK-NEXT: ld.local.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: st.local.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load i8, ptr addrspace(5) %a %a.add = add i8 %a.load, 1 @@ -2578,10 +2578,10 @@ define void @local_i16(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_i16_param_0]; -; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_i16_param_0]; +; CHECK-NEXT: ld.local.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: st.local.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load i16, ptr addrspace(5) %a %a.add = add i16 %a.load, 1 @@ -2596,10 +2596,10 @@ define void @local_i32(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_i32_param_0]; 
-; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_i32_param_0]; +; CHECK-NEXT: ld.local.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: st.local.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load i32, ptr addrspace(5) %a %a.add = add i32 %a.load, 1 @@ -2613,10 +2613,10 @@ define void @local_i64(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_i64_param_0]; -; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_i64_param_0]; +; CHECK-NEXT: ld.local.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, %rd2, 1; -; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: st.local.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load i64, ptr addrspace(5) %a %a.add = add i64 %a.load, 1 @@ -2631,10 +2631,10 @@ define void @local_float(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_float_param_0]; -; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_float_param_0]; +; CHECK-NEXT: ld.local.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: st.local.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load float, ptr addrspace(5) %a %a.add = fadd float %a.load, 1. 
@@ -2649,10 +2649,10 @@ define void @local_double(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_double_param_0]; -; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_double_param_0]; +; CHECK-NEXT: ld.local.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: st.local.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load double, ptr addrspace(5) %a %a.add = fadd double %a.load, 1. @@ -2669,10 +2669,10 @@ define void @local_volatile_i8(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i8_param_0]; -; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_i8_param_0]; +; CHECK-NEXT: ld.local.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: st.local.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load volatile i8, ptr addrspace(5) %a %a.add = add i8 %a.load, 1 @@ -2687,10 +2687,10 @@ define void @local_volatile_i16(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i16_param_0]; -; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_i16_param_0]; +; CHECK-NEXT: ld.local.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: st.local.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load volatile i16, ptr addrspace(5) %a %a.add = add i16 %a.load, 1 @@ -2705,10 +2705,10 @@ define void @local_volatile_i32(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i32_param_0]; -; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: 
ld.param.b64 %rd1, [local_volatile_i32_param_0]; +; CHECK-NEXT: ld.local.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: st.local.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load volatile i32, ptr addrspace(5) %a %a.add = add i32 %a.load, 1 @@ -2722,10 +2722,10 @@ define void @local_volatile_i64(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_i64_param_0]; -; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_i64_param_0]; +; CHECK-NEXT: ld.local.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, %rd2, 1; -; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: st.local.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load volatile i64, ptr addrspace(5) %a %a.add = add i64 %a.load, 1 @@ -2740,10 +2740,10 @@ define void @local_volatile_float(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_float_param_0]; -; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_float_param_0]; +; CHECK-NEXT: ld.local.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: st.local.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load volatile float, ptr addrspace(5) %a %a.add = fadd float %a.load, 1. 
@@ -2758,10 +2758,10 @@ define void @local_volatile_double(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_double_param_0]; -; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_double_param_0]; +; CHECK-NEXT: ld.local.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: st.local.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load volatile double, ptr addrspace(5) %a %a.add = fadd double %a.load, 1. @@ -2778,10 +2778,10 @@ define void @local_unordered_sys_i8(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i8_param_0]; -; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_sys_i8_param_0]; +; CHECK-NEXT: ld.local.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: st.local.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic i8, ptr addrspace(5) %a unordered, align 1 %a.add = add i8 %a.load, 1 @@ -2796,10 +2796,10 @@ define void @local_unordered_sys_i16(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i16_param_0]; -; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_sys_i16_param_0]; +; CHECK-NEXT: ld.local.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: st.local.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic i16, ptr addrspace(5) %a unordered, align 2 %a.add = add i16 %a.load, 1 @@ -2814,10 +2814,10 @@ define void @local_unordered_sys_i32(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: 
ld.param.u64 %rd1, [local_unordered_sys_i32_param_0]; -; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_sys_i32_param_0]; +; CHECK-NEXT: ld.local.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: st.local.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load atomic i32, ptr addrspace(5) %a unordered, align 4 %a.add = add i32 %a.load, 1 @@ -2831,10 +2831,10 @@ define void @local_unordered_sys_i64(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_i64_param_0]; -; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_sys_i64_param_0]; +; CHECK-NEXT: ld.local.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, %rd2, 1; -; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: st.local.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load atomic i64, ptr addrspace(5) %a unordered, align 8 %a.add = add i64 %a.load, 1 @@ -2849,10 +2849,10 @@ define void @local_unordered_sys_float(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_float_param_0]; -; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_sys_float_param_0]; +; CHECK-NEXT: ld.local.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: st.local.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load atomic float, ptr addrspace(5) %a unordered, align 4 %a.add = fadd float %a.load, 1. 
@@ -2867,10 +2867,10 @@ define void @local_unordered_sys_double(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_sys_double_param_0]; -; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_sys_double_param_0]; +; CHECK-NEXT: ld.local.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: st.local.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load atomic double, ptr addrspace(5) %a unordered, align 8 %a.add = fadd double %a.load, 1. @@ -2887,10 +2887,10 @@ define void @local_unordered_volatile_sys_i8(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i8_param_0]; -; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.local.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: st.local.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(5) %a unordered, align 1 %a.add = add i8 %a.load, 1 @@ -2905,10 +2905,10 @@ define void @local_unordered_volatile_sys_i16(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i16_param_0]; -; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.local.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: st.local.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i16, ptr addrspace(5) %a unordered, align 2 %a.add = add i16 %a.load, 1 @@ -2923,10 +2923,10 @@ define void 
@local_unordered_volatile_sys_i32(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i32_param_0]; -; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.local.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: st.local.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i32, ptr addrspace(5) %a unordered, align 4 %a.add = add i32 %a.load, 1 @@ -2940,10 +2940,10 @@ define void @local_unordered_volatile_sys_i64(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_i64_param_0]; -; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.local.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, %rd2, 1; -; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: st.local.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load atomic volatile i64, ptr addrspace(5) %a unordered, align 8 %a.add = add i64 %a.load, 1 @@ -2958,10 +2958,10 @@ define void @local_unordered_volatile_sys_float(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_float_param_0]; -; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.local.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: st.local.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load atomic volatile float, ptr addrspace(5) %a unordered, align 4 %a.add = fadd float %a.load, 1. 
@@ -2976,10 +2976,10 @@ define void @local_unordered_volatile_sys_double(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_unordered_volatile_sys_double_param_0]; -; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_unordered_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.local.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: st.local.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load atomic volatile double, ptr addrspace(5) %a unordered, align 8 %a.add = fadd double %a.load, 1. @@ -2996,10 +2996,10 @@ define void @local_monotonic_sys_i8(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i8_param_0]; -; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_sys_i8_param_0]; +; CHECK-NEXT: ld.local.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: st.local.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -3014,10 +3014,10 @@ define void @local_monotonic_sys_i16(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i16_param_0]; -; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_sys_i16_param_0]; +; CHECK-NEXT: ld.local.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: st.local.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic i16, ptr addrspace(5) %a monotonic, align 2 %a.add = add i16 %a.load, 1 @@ -3032,10 +3032,10 @@ define void @local_monotonic_sys_i32(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg 
.b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i32_param_0]; -; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_sys_i32_param_0]; +; CHECK-NEXT: ld.local.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: st.local.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load atomic i32, ptr addrspace(5) %a monotonic, align 4 %a.add = add i32 %a.load, 1 @@ -3049,10 +3049,10 @@ define void @local_monotonic_sys_i64(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_i64_param_0]; -; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_sys_i64_param_0]; +; CHECK-NEXT: ld.local.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, %rd2, 1; -; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: st.local.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load atomic i64, ptr addrspace(5) %a monotonic, align 8 %a.add = add i64 %a.load, 1 @@ -3067,10 +3067,10 @@ define void @local_monotonic_sys_float(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_float_param_0]; -; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_sys_float_param_0]; +; CHECK-NEXT: ld.local.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: st.local.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load atomic float, ptr addrspace(5) %a monotonic, align 4 %a.add = fadd float %a.load, 1. 
@@ -3085,10 +3085,10 @@ define void @local_monotonic_sys_double(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_sys_double_param_0]; -; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_sys_double_param_0]; +; CHECK-NEXT: ld.local.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: st.local.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load atomic double, ptr addrspace(5) %a monotonic, align 8 %a.add = fadd double %a.load, 1. @@ -3105,10 +3105,10 @@ define void @local_monotonic_volatile_sys_i8(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i8_param_0]; -; CHECK-NEXT: ld.local.u8 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_volatile_sys_i8_param_0]; +; CHECK-NEXT: ld.local.b8 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.local.u8 [%rd1], %rs2; +; CHECK-NEXT: st.local.b8 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i8, ptr addrspace(5) %a monotonic, align 1 %a.add = add i8 %a.load, 1 @@ -3123,10 +3123,10 @@ define void @local_monotonic_volatile_sys_i16(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i16_param_0]; -; CHECK-NEXT: ld.local.u16 %rs1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_volatile_sys_i16_param_0]; +; CHECK-NEXT: ld.local.b16 %rs1, [%rd1]; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: st.local.u16 [%rd1], %rs2; +; CHECK-NEXT: st.local.b16 [%rd1], %rs2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i16, ptr addrspace(5) %a monotonic, align 2 %a.add = add i16 %a.load, 1 @@ -3141,10 +3141,10 @@ define void 
@local_monotonic_volatile_sys_i32(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i32_param_0]; -; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_volatile_sys_i32_param_0]; +; CHECK-NEXT: ld.local.b32 %r1, [%rd1]; ; CHECK-NEXT: add.s32 %r2, %r1, 1; -; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: st.local.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load atomic volatile i32, ptr addrspace(5) %a monotonic, align 4 %a.add = add i32 %a.load, 1 @@ -3158,10 +3158,10 @@ define void @local_monotonic_volatile_sys_i64(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_i64_param_0]; -; CHECK-NEXT: ld.local.u64 %rd2, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_volatile_sys_i64_param_0]; +; CHECK-NEXT: ld.local.b64 %rd2, [%rd1]; ; CHECK-NEXT: add.s64 %rd3, %rd2, 1; -; CHECK-NEXT: st.local.u64 [%rd1], %rd3; +; CHECK-NEXT: st.local.b64 [%rd1], %rd3; ; CHECK-NEXT: ret; %a.load = load atomic volatile i64, ptr addrspace(5) %a monotonic, align 8 %a.add = add i64 %a.load, 1 @@ -3176,10 +3176,10 @@ define void @local_monotonic_volatile_sys_float(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_float_param_0]; -; CHECK-NEXT: ld.local.f32 %f1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_volatile_sys_float_param_0]; +; CHECK-NEXT: ld.local.b32 %f1, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f2, %f1, 0f3F800000; -; CHECK-NEXT: st.local.f32 [%rd1], %f2; +; CHECK-NEXT: st.local.b32 [%rd1], %f2; ; CHECK-NEXT: ret; %a.load = load atomic volatile float, ptr addrspace(5) %a monotonic, align 4 %a.add = fadd float %a.load, 1. 
@@ -3194,10 +3194,10 @@ define void @local_monotonic_volatile_sys_double(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_monotonic_volatile_sys_double_param_0]; -; CHECK-NEXT: ld.local.f64 %fd1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_monotonic_volatile_sys_double_param_0]; +; CHECK-NEXT: ld.local.b64 %fd1, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd2, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.local.f64 [%rd1], %fd2; +; CHECK-NEXT: st.local.b64 [%rd1], %fd2; ; CHECK-NEXT: ret; %a.load = load atomic volatile double, ptr addrspace(5) %a monotonic, align 8 %a.add = fadd double %a.load, 1. diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll index 74554dfcd679..f967fd1381be 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-70.ll @@ -34,40 +34,40 @@ ; CHECK-LABEL: generic_unordered_gpu define void @generic_unordered_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.relaxed.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a syncscope("device") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.gpu.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a syncscope("device") unordered, align 1 - ; CHECK: ld.relaxed.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b syncscope("device") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.gpu.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b syncscope("device") unordered, align 2 - ; CHECK: ld.relaxed.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.b32 
%r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c syncscope("device") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.gpu.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c syncscope("device") unordered, align 4 - ; CHECK: ld.relaxed.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d syncscope("device") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.gpu.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d syncscope("device") unordered, align 8 - ; CHECK: ld.relaxed.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e syncscope("device") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.gpu.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e syncscope("device") unordered, align 4 - ; CHECK: ld.relaxed.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e syncscope("device") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.gpu.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e syncscope("device") unordered, align 8 ret void @@ -75,40 +75,40 @@ define void @generic_unordered_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local ; CHECK-LABEL: generic_unordered_volatile_gpu define void @generic_unordered_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a syncscope("device") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a syncscope("device") unordered, align 1 - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr %b syncscope("device") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr %b syncscope("device") unordered, align 2 - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr %c syncscope("device") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr %c syncscope("device") unordered, align 4 - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr %d syncscope("device") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; 
CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr %d syncscope("device") unordered, align 8 - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e syncscope("device") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e syncscope("device") unordered, align 4 - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr %e syncscope("device") unordered, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e syncscope("device") unordered, align 8 ret void @@ -116,40 +116,40 @@ define void @generic_unordered_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr ; CHECK-LABEL: generic_unordered_cta define void @generic_unordered_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.relaxed.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a syncscope("block") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cta.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a syncscope("block") unordered, align 1 - ; CHECK: ld.relaxed.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b syncscope("block") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cta.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store 
atomic i16 %b.add, ptr %b syncscope("block") unordered, align 2 - ; CHECK: ld.relaxed.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c syncscope("block") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.cta.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c syncscope("block") unordered, align 4 - ; CHECK: ld.relaxed.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d syncscope("block") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.cta.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d syncscope("block") unordered, align 8 - ; CHECK: ld.relaxed.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e syncscope("block") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.cta.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e syncscope("block") unordered, align 4 - ; CHECK: ld.relaxed.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e syncscope("block") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.cta.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e syncscope("block") unordered, align 8 ret void @@ -157,40 +157,40 @@ define void @generic_unordered_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local ; CHECK-LABEL: generic_unordered_volatile_cta define void @generic_unordered_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a syncscope("block") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a syncscope("block") unordered, align 1 - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr %b syncscope("block") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr %b syncscope("block") unordered, align 2 - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr %c syncscope("block") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr %c syncscope("block") unordered, align 4 - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr %d syncscope("block") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; 
CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr %d syncscope("block") unordered, align 8 - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e syncscope("block") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e syncscope("block") unordered, align 4 - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr %e syncscope("block") unordered, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e syncscope("block") unordered, align 8 ret void @@ -198,40 +198,40 @@ define void @generic_unordered_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr ; CHECK-LABEL: generic_monotonic_gpu define void @generic_monotonic_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.relaxed.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a syncscope("device") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.gpu.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a syncscope("device") monotonic, align 1 - ; CHECK: ld.relaxed.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b syncscope("device") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.gpu.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic 
i16 %b.add, ptr %b syncscope("device") monotonic, align 2 - ; CHECK: ld.relaxed.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c syncscope("device") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.gpu.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c syncscope("device") monotonic, align 4 - ; CHECK: ld.relaxed.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d syncscope("device") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.gpu.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d syncscope("device") monotonic, align 8 - ; CHECK: ld.relaxed.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e syncscope("device") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.gpu.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e syncscope("device") monotonic, align 4 - ; CHECK: ld.relaxed.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e syncscope("device") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.gpu.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e syncscope("device") monotonic, align 8 ret void @@ -239,40 +239,40 @@ define void @generic_monotonic_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local ; CHECK-LABEL: generic_monotonic_volatile_gpu define void @generic_monotonic_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a syncscope("device") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a syncscope("device") monotonic, align 1 - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr %b syncscope("device") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr %b syncscope("device") monotonic, align 2 - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr %c syncscope("device") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr %c syncscope("device") monotonic, align 4 - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr %d syncscope("device") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + 
; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr %d syncscope("device") monotonic, align 8 - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e syncscope("device") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e syncscope("device") monotonic, align 4 - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr %e syncscope("device") monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e syncscope("device") monotonic, align 8 ret void @@ -280,40 +280,40 @@ define void @generic_monotonic_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr ; CHECK-LABEL: generic_monotonic_cta define void @generic_monotonic_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.relaxed.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a syncscope("block") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cta.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a syncscope("block") monotonic, align 1 - ; CHECK: ld.relaxed.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b syncscope("block") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cta.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store 
atomic i16 %b.add, ptr %b syncscope("block") monotonic, align 2 - ; CHECK: ld.relaxed.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c syncscope("block") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.cta.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c syncscope("block") monotonic, align 4 - ; CHECK: ld.relaxed.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d syncscope("block") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.cta.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d syncscope("block") monotonic, align 8 - ; CHECK: ld.relaxed.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e syncscope("block") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.cta.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e syncscope("block") monotonic, align 4 - ; CHECK: ld.relaxed.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e syncscope("block") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.cta.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e syncscope("block") monotonic, align 8 ret void @@ -321,40 +321,40 @@ define void @generic_monotonic_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local ; CHECK-LABEL: generic_monotonic_volatile_cta define void @generic_monotonic_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a syncscope("block") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a syncscope("block") monotonic, align 1 - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr %b syncscope("block") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr %b syncscope("block") monotonic, align 2 - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr %c syncscope("block") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr %c syncscope("block") monotonic, align 4 - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr %d syncscope("block") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; 
CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr %d syncscope("block") monotonic, align 8 - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e syncscope("block") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e syncscope("block") monotonic, align 4 - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr %e syncscope("block") monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e syncscope("block") monotonic, align 8 ret void @@ -362,40 +362,40 @@ define void @generic_monotonic_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr ; CHECK-LABEL: generic_acq_rel_sys define void @generic_acq_rel_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a release, align 1 - ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b release, align 2 - ; CHECK: ld.acquire.sys.u32 
%r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c release, align 4 - ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d release, align 8 - ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e release, align 4 - ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e release, align 8 ret void @@ -403,40 +403,40 @@ define void @generic_acq_rel_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u ; CHECK-LABEL: generic_acq_rel_volatile_sys define void @generic_acq_rel_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a release, align 1 - ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr %b acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr %b release, align 2 - ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr %c acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr %c release, align 4 - ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr %d acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr %d release, align 8 - ; CHECK: 
ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e release, align 4 - ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr %e acquire, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e release, align 8 ret void @@ -444,40 +444,40 @@ define void @generic_acq_rel_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e ; CHECK-LABEL: generic_acq_rel_gpu define void @generic_acq_rel_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.acquire.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a syncscope("device") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.gpu.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a syncscope("device") release, align 1 - ; CHECK: ld.acquire.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b syncscope("device") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.gpu.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b syncscope("device") release, align 2 - ; CHECK: ld.acquire.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c 
syncscope("device") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.gpu.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c syncscope("device") release, align 4 - ; CHECK: ld.acquire.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d syncscope("device") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.gpu.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d syncscope("device") release, align 8 - ; CHECK: ld.acquire.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e syncscope("device") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.gpu.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e syncscope("device") release, align 4 - ; CHECK: ld.acquire.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e syncscope("device") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.gpu.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e syncscope("device") release, align 8 ret void @@ -485,40 +485,40 @@ define void @generic_acq_rel_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u ; CHECK-LABEL: generic_acq_rel_volatile_gpu define void @generic_acq_rel_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a syncscope("device") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a syncscope("device") release, align 1 - ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr %b syncscope("device") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr %b syncscope("device") release, align 2 - ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr %c syncscope("device") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr %c syncscope("device") release, align 4 - ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr %d syncscope("device") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.u64 
[%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr %d syncscope("device") release, align 8 - ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e syncscope("device") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e syncscope("device") release, align 4 - ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr %e syncscope("device") acquire, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e syncscope("device") release, align 8 ret void @@ -526,40 +526,40 @@ define void @generic_acq_rel_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e ; CHECK-LABEL: generic_acq_rel_cta define void @generic_acq_rel_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.acquire.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a syncscope("block") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cta.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a syncscope("block") release, align 1 - ; CHECK: ld.acquire.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b syncscope("block") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cta.b16 
[%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b syncscope("block") release, align 2 - ; CHECK: ld.acquire.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c syncscope("block") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.cta.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c syncscope("block") release, align 4 - ; CHECK: ld.acquire.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d syncscope("block") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.cta.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d syncscope("block") release, align 8 - ; CHECK: ld.acquire.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e syncscope("block") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.cta.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e syncscope("block") release, align 4 - ; CHECK: ld.acquire.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e syncscope("block") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.cta.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e syncscope("block") release, align 8 ret void @@ -567,40 +567,40 @@ define void @generic_acq_rel_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_u ; CHECK-LABEL: generic_acq_rel_volatile_cta define void @generic_acq_rel_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a syncscope("block") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a syncscope("block") release, align 1 - ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr %b syncscope("block") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr %b syncscope("block") release, align 2 - ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr %c syncscope("block") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr %c syncscope("block") release, align 4 - ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr %d syncscope("block") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.u64 
[%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr %d syncscope("block") release, align 8 - ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e syncscope("block") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e syncscope("block") release, align 4 - ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr %e syncscope("block") acquire, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e syncscope("block") release, align 8 ret void @@ -609,51 +609,51 @@ define void @generic_acq_rel_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e ; CHECK-LABEL: generic_sc_sys define void @generic_sc_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: 
st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.sys - ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e seq_cst, align 8 ret void @@ -662,51 +662,51 @@ define void @generic_sc_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unname ; CHECK-LABEL: generic_sc_volatile_sys define void @generic_sc_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr %b seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr %b seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr %c seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr %c seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr %d seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; 
CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr %d seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr %e seq_cst, align 8 %f.add = fadd double %f.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e seq_cst, align 8 ret void @@ -715,51 +715,51 @@ define void @generic_sc_volatile_sys(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) loc ; CHECK-LABEL: generic_sc_gpu define void @generic_sc_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a syncscope("device") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.gpu.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a syncscope("device") seq_cst, align 1 ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b syncscope("device") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: 
fence.sc.gpu - ; CHECK: st.release.gpu.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.gpu.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b syncscope("device") seq_cst, align 2 ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c syncscope("device") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.gpu.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c syncscope("device") seq_cst, align 4 ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d syncscope("device") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.gpu.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d syncscope("device") seq_cst, align 8 ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e syncscope("device") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.gpu.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e syncscope("device") seq_cst, align 4 ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e syncscope("device") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.gpu.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e syncscope("device") seq_cst, align 8 ret void @@ -768,51 +768,51 @@ define void @generic_sc_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unname ; CHECK-LABEL: generic_sc_volatile_gpu define void @generic_sc_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a syncscope("device") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a syncscope("device") seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr %b syncscope("device") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr %b syncscope("device") seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr %c syncscope("device") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr %c syncscope("device") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 
%rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr %d syncscope("device") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr %d syncscope("device") seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e syncscope("device") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e syncscope("device") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr %e syncscope("device") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.sys - ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e syncscope("device") seq_cst, align 8 ret void @@ -821,51 +821,51 @@ define void @generic_sc_volatile_gpu(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) loc ; CHECK-LABEL: generic_sc_cta define void @generic_sc_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a syncscope("block") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cta.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a syncscope("block") seq_cst, align 1 ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b syncscope("block") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cta.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b syncscope("block") seq_cst, align 2 ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c syncscope("block") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.cta.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c syncscope("block") seq_cst, align 4 ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d 
syncscope("block") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.cta.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d syncscope("block") seq_cst, align 8 ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e syncscope("block") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.cta.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e syncscope("block") seq_cst, align 4 ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e syncscope("block") seq_cst, align 8 %f.add = fadd double %f.load, 1. ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.cta.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e syncscope("block") seq_cst, align 8 ret void @@ -874,51 +874,51 @@ define void @generic_sc_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unname ; CHECK-LABEL: generic_sc_volatile_cta define void @generic_sc_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a syncscope("block") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a syncscope("block") seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; 
CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr %b syncscope("block") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr %b syncscope("block") seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr %c syncscope("block") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr %c syncscope("block") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr %d syncscope("block") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr %d syncscope("block") seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e syncscope("block") seq_cst, align 4 %e.add = fadd float %e.load, 1. 
; CHECK: fence.sc.sys - ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e syncscope("block") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr %e syncscope("block") seq_cst, align 8 %f.add = fadd double %f.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e syncscope("block") seq_cst, align 8 ret void @@ -928,40 +928,40 @@ define void @generic_sc_volatile_cta(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) loc ; CHECK-LABEL: global_unordered_gpu define void @global_unordered_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.gpu.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1 - ; CHECK: ld.relaxed.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.gpu.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2 - ; CHECK: ld.relaxed.gpu.global.u32 %r{{[0-9]+}}, 
[%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.gpu.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4 - ; CHECK: ld.relaxed.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.gpu.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8 - ; CHECK: ld.relaxed.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.gpu.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4 - ; CHECK: ld.relaxed.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.gpu.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8 ret void @@ -969,40 +969,40 @@ define void @global_unordered_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ; CHECK-LABEL: global_unordered_volatile_gpu define void @global_unordered_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") unordered, align 1 - ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") unordered, align 2 - ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 
%c.add, ptr addrspace(1) %c syncscope("device") unordered, align 4 - ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") unordered, align 8 - ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") unordered, align 4 - ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") unordered, align 8 ret void @@ -1010,40 +1010,40 @@ define void @global_unordered_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) ; CHECK-LABEL: global_unordered_cta define void @global_unordered_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cta.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1 - ; CHECK: ld.relaxed.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cta.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2 - ; CHECK: ld.relaxed.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.cta.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") unordered, align 4 - ; CHECK: ld.relaxed.cta.global.u64 %rd{{[0-9]+}}, 
[%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.cta.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8 - ; CHECK: ld.relaxed.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.cta.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4 - ; CHECK: ld.relaxed.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.cta.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8 ret void @@ -1051,40 +1051,40 @@ define void @global_unordered_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ; CHECK-LABEL: global_unordered_volatile_cta define void @global_unordered_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") unordered, align 1 - ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") unordered, align 2 - ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, 
ptr addrspace(1) %c syncscope("block") unordered, align 4 - ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") unordered, align 8 - ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") unordered, align 4 - ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") unordered, align 8 ret void @@ -1092,40 +1092,40 @@ define void @global_unordered_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) ; CHECK-LABEL: global_monotonic_gpu define void @global_monotonic_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.gpu.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1 - ; CHECK: ld.relaxed.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.gpu.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2 - ; CHECK: ld.relaxed.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.gpu.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4 - ; CHECK: ld.relaxed.gpu.global.u64 
%rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.gpu.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8 - ; CHECK: ld.relaxed.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.gpu.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4 - ; CHECK: ld.relaxed.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.gpu.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8 ret void @@ -1133,40 +1133,40 @@ define void @global_monotonic_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ; CHECK-LABEL: global_monotonic_volatile_gpu define void @global_monotonic_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") monotonic, align 1 - ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") monotonic, align 2 - ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 
%c.add, ptr addrspace(1) %c syncscope("device") monotonic, align 4 - ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") monotonic, align 8 - ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") monotonic, align 4 - ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") monotonic, align 8 ret void @@ -1174,40 +1174,40 @@ define void @global_monotonic_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) ; CHECK-LABEL: global_monotonic_cta define void @global_monotonic_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cta.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1 - ; CHECK: ld.relaxed.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cta.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2 - ; CHECK: ld.relaxed.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.cta.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") monotonic, align 4 - ; CHECK: ld.relaxed.cta.global.u64 %rd{{[0-9]+}}, 
[%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.cta.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8 - ; CHECK: ld.relaxed.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.cta.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4 - ; CHECK: ld.relaxed.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.cta.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8 ret void @@ -1215,40 +1215,40 @@ define void @global_monotonic_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ; CHECK-LABEL: global_monotonic_volatile_cta define void @global_monotonic_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.mmio.relaxed.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") monotonic, align 1 - ; CHECK: ld.mmio.relaxed.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") monotonic, align 2 - ; CHECK: ld.mmio.relaxed.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, 
ptr addrspace(1) %c syncscope("block") monotonic, align 4 - ; CHECK: ld.mmio.relaxed.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.mmio.relaxed.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") monotonic, align 8 - ; CHECK: ld.mmio.relaxed.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.mmio.relaxed.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") monotonic, align 4 - ; CHECK: ld.mmio.relaxed.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.mmio.relaxed.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.mmio.relaxed.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.mmio.relaxed.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") monotonic, align 8 ret void @@ -1256,40 +1256,40 @@ define void @global_monotonic_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) ; CHECK-LABEL: global_acq_rel_sys define void @global_acq_rel_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a release, align 1 - ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b release, align 2 - ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c release, align 4 - ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d 
acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d release, align 8 - ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e release, align 4 - ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e acquire, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e release, align 8 ret void @@ -1297,40 +1297,40 @@ define void @global_acq_rel_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad ; CHECK-LABEL: global_acq_rel_volatile_sys define void @global_acq_rel_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a release, align 1 - ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, 
[%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b release, align 2 - ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c release, align 4 - ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d release, align 8 - ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e release, align 4 - ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e release, align 8 ret void @@ -1338,40 +1338,40 @@ define void @global_acq_rel_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) % ; CHECK-LABEL: global_acq_rel_gpu define void @global_acq_rel_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.acquire.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.gpu.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1 - ; CHECK: ld.acquire.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.gpu.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2 - ; CHECK: ld.acquire.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.gpu.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4 - ; CHECK: ld.acquire.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: 
ld.acquire.gpu.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.gpu.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8 - ; CHECK: ld.acquire.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.gpu.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4 - ; CHECK: ld.acquire.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.gpu.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8 ret void @@ -1379,40 +1379,40 @@ define void @global_acq_rel_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad ; CHECK-LABEL: global_acq_rel_volatile_gpu define void @global_acq_rel_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") release, align 1 - ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") release, align 2 - ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") release, align 4 - ; CHECK: 
ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") release, align 8 - ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") release, align 4 - ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") release, align 8 ret void @@ -1420,40 +1420,40 @@ define void @global_acq_rel_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) % ; CHECK-LABEL: global_acq_rel_cta define void @global_acq_rel_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.acquire.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cta.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1 - ; CHECK: ld.acquire.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cta.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2 - ; CHECK: ld.acquire.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.cta.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4 - ; CHECK: ld.acquire.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: 
ld.acquire.cta.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.cta.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8 - ; CHECK: ld.acquire.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.cta.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4 - ; CHECK: ld.acquire.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.cta.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8 ret void @@ -1461,40 +1461,40 @@ define void @global_acq_rel_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad ; CHECK-LABEL: global_acq_rel_volatile_cta define void @global_acq_rel_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") release, align 1 - ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") release, align 2 - ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") release, align 4 - ; CHECK: 
ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") release, align 8 - ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") release, align 4 - ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") release, align 8 ret void @@ -1503,51 +1503,51 @@ define void @global_acq_rel_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) % ; CHECK-LABEL: global_seq_cst_sys define void @global_seq_cst_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u64 
%rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e seq_cst, align 8 ret void @@ -1556,51 +1556,51 @@ define void @global_seq_cst_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad ; CHECK-LABEL: global_seq_cst_volatile_sys define void @global_seq_cst_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c seq_cst, align 4 
; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e seq_cst, align 8 ret void @@ -1609,51 +1609,51 @@ define void @global_seq_cst_volatile_sys(ptr addrspace(1) %a, ptr addrspace(1) % ; CHECK-LABEL: global_seq_cst_gpu define void @global_seq_cst_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.gpu.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1 ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.gpu.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2 ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.gpu.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, 
ptr addrspace(1) %c syncscope("device") seq_cst, align 4 ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.gpu.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8 ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.gpu.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4 ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.gpu.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8 ret void @@ -1662,51 +1662,51 @@ define void @global_seq_cst_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad ; CHECK-LABEL: global_seq_cst_volatile_gpu define void @global_seq_cst_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("device") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("device") seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("device") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("device") seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("device") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: 
st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("device") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("device") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("device") seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("device") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("device") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("device") seq_cst, align 8 ret void @@ -1715,51 +1715,51 @@ define void @global_seq_cst_volatile_gpu(ptr addrspace(1) %a, ptr addrspace(1) % ; CHECK-LABEL: global_seq_cst_cta define void @global_seq_cst_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cta.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1 ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cta.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2 ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.cta.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store 
atomic i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4 ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.cta.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8 ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.cta.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4 ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.cta - ; CHECK: st.release.cta.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.cta.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8 ret void @@ -1768,51 +1768,51 @@ define void @global_seq_cst_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr ad ; CHECK-LABEL: global_seq_cst_volatile_cta define void @global_seq_cst_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("block") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("block") seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("block") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("block") seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("block") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: 
st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("block") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("block") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("block") seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("block") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("block") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("block") seq_cst, align 8 ret void @@ -1822,40 +1822,40 @@ define void @global_seq_cst_volatile_cta(ptr addrspace(1) %a, ptr addrspace(1) % ; CHECK-LABEL: shared_unordered_gpu define void @shared_unordered_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.gpu.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1 - ; CHECK: ld.relaxed.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.gpu.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2 - ; CHECK: ld.relaxed.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.gpu.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4 - ; CHECK: ld.relaxed.gpu.shared.u64 
%rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.gpu.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8 - ; CHECK: ld.relaxed.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.gpu.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4 - ; CHECK: ld.relaxed.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.gpu.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8 ret void @@ -1863,40 +1863,40 @@ define void @shared_unordered_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ; CHECK-LABEL: shared_unordered_volatile_gpu define void @shared_unordered_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") unordered, align 1 - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") unordered, align 2 - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") unordered, align 4 - ; CHECK: 
ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") unordered, align 8 - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") unordered, align 4 - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") unordered, align 8 ret void @@ -1904,40 +1904,40 @@ define void @shared_unordered_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) ; CHECK-LABEL: shared_unordered_cta define void @shared_unordered_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cta.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1 - ; CHECK: ld.relaxed.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cta.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2 - ; CHECK: ld.relaxed.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.cta.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4 - ; CHECK: ld.relaxed.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + 
; CHECK: ld.relaxed.cta.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.cta.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8 - ; CHECK: ld.relaxed.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.cta.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4 - ; CHECK: ld.relaxed.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.cta.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8 ret void @@ -1945,40 +1945,40 @@ define void @shared_unordered_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ; CHECK-LABEL: shared_unordered_volatile_cta define void @shared_unordered_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") unordered, align 1 - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") unordered, align 2 - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") unordered, align 4 - ; CHECK: ld.volatile.shared.u64 
%rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") unordered, align 8 - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") unordered, align 4 - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") unordered, align 8 ret void @@ -1986,40 +1986,40 @@ define void @shared_unordered_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) ; CHECK-LABEL: shared_monotonic_gpu define void @shared_monotonic_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.gpu.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1 - ; CHECK: ld.relaxed.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.gpu.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2 - ; CHECK: ld.relaxed.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.gpu.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4 - ; CHECK: ld.relaxed.gpu.shared.u64 %rd{{[0-9]+}}, 
[%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.gpu.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8 - ; CHECK: ld.relaxed.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.gpu.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4 - ; CHECK: ld.relaxed.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.gpu.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.gpu.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8 ret void @@ -2027,40 +2027,40 @@ define void @shared_monotonic_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ; CHECK-LABEL: shared_monotonic_volatile_gpu define void @shared_monotonic_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") monotonic, align 1 - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") monotonic, align 2 - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") monotonic, align 4 - ; CHECK: 
ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") monotonic, align 8 - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") monotonic, align 4 - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") monotonic, align 8 ret void @@ -2068,40 +2068,40 @@ define void @shared_monotonic_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) ; CHECK-LABEL: shared_monotonic_cta define void @shared_monotonic_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cta.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1 - ; CHECK: ld.relaxed.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cta.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2 - ; CHECK: ld.relaxed.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.cta.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4 - ; CHECK: ld.relaxed.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + 
; CHECK: ld.relaxed.cta.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.cta.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8 - ; CHECK: ld.relaxed.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.cta.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4 - ; CHECK: ld.relaxed.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cta.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.cta.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8 ret void @@ -2109,40 +2109,40 @@ define void @shared_monotonic_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ; CHECK-LABEL: shared_monotonic_volatile_cta define void @shared_monotonic_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") monotonic, align 1 - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") monotonic, align 2 - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") monotonic, align 4 - ; CHECK: ld.volatile.shared.u64 
%rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") monotonic, align 8 - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") monotonic, align 4 - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") monotonic, align 8 ret void @@ -2150,40 +2150,40 @@ define void @shared_monotonic_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) ; CHECK-LABEL: shared_acq_rel_sys define void @shared_acq_rel_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a release, align 1 - ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b release, align 2 - ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c release, align 4 - ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d acquire, align 8 
%d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d release, align 8 - ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e release, align 4 - ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e acquire, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e release, align 8 ret void @@ -2191,40 +2191,40 @@ define void @shared_acq_rel_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad ; CHECK-LABEL: shared_acq_rel_volatile_sys define void @shared_acq_rel_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a release, align 1 - ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = 
load atomic volatile i16, ptr addrspace(3) %b acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(3) %b release, align 2 - ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(3) %c acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(3) %c release, align 4 - ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(3) %d acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(3) %d release, align 8 - ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e release, align 4 - ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(3) %e acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e release, align 8 ret void @@ -2232,40 +2232,40 @@ define void @shared_acq_rel_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) % ; CHECK-LABEL: shared_acq_rel_gpu define void @shared_acq_rel_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.acquire.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.gpu.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1 - ; CHECK: ld.acquire.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.gpu.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2 - ; CHECK: ld.acquire.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.gpu.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4 - ; CHECK: ld.acquire.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: 
ld.acquire.gpu.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.gpu.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8 - ; CHECK: ld.acquire.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.gpu.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4 - ; CHECK: ld.acquire.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.gpu.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8 ret void @@ -2273,40 +2273,40 @@ define void @shared_acq_rel_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad ; CHECK-LABEL: shared_acq_rel_volatile_gpu define void @shared_acq_rel_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") release, align 1 - ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") release, align 2 - ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") release, align 4 - ; CHECK: 
ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") release, align 8 - ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") release, align 4 - ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") release, align 8 ret void @@ -2314,40 +2314,40 @@ define void @shared_acq_rel_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) % ; CHECK-LABEL: shared_acq_rel_cta define void @shared_acq_rel_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.acquire.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cta.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1 - ; CHECK: ld.acquire.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cta.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2 - ; CHECK: ld.acquire.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.cta.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4 - ; CHECK: ld.acquire.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: 
ld.acquire.cta.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.cta.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8 - ; CHECK: ld.acquire.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.cta.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4 - ; CHECK: ld.acquire.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.cta.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8 ret void @@ -2355,40 +2355,40 @@ define void @shared_acq_rel_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad ; CHECK-LABEL: shared_acq_rel_volatile_cta define void @shared_acq_rel_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") release, align 1 - ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") release, align 2 - ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") release, align 4 - ; CHECK: 
ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") release, align 8 - ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") release, align 4 - ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") release, align 8 ret void @@ -2397,51 +2397,51 @@ define void @shared_acq_rel_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) % ; CHECK-LABEL: shared_seq_cst_sys define void @shared_seq_cst_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u64 
%rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e seq_cst, align 8 ret void @@ -2450,51 +2450,51 @@ define void @shared_seq_cst_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad ; CHECK-LABEL: shared_seq_cst_volatile_sys define void @shared_seq_cst_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(3) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(3) %b seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(3) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(3) %c seq_cst, align 4 
; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(3) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(3) %d seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(3) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e seq_cst, align 8 ret void @@ -2503,51 +2503,51 @@ define void @shared_seq_cst_volatile_sys(ptr addrspace(3) %a, ptr addrspace(3) % ; CHECK-LABEL: shared_seq_cst_gpu define void @shared_seq_cst_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.gpu.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1 ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.gpu.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2 ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.gpu.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, 
ptr addrspace(3) %c syncscope("device") seq_cst, align 4 ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.gpu.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8 ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.gpu.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4 ; CHECK: fence.sc.gpu - ; CHECK: ld.acquire.gpu.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.gpu.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.gpu - ; CHECK: st.release.gpu.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.gpu.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8 ret void @@ -2556,51 +2556,51 @@ define void @shared_seq_cst_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad ; CHECK-LABEL: shared_seq_cst_volatile_gpu define void @shared_seq_cst_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("device") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("device") seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("device") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("device") seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("device") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: 
st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("device") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("device") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("device") seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("device") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("device") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("device") seq_cst, align 8 ret void @@ -2609,51 +2609,51 @@ define void @shared_seq_cst_volatile_gpu(ptr addrspace(3) %a, ptr addrspace(3) % ; CHECK-LABEL: shared_seq_cst_cta define void @shared_seq_cst_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cta.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1 ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cta.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2 ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.cta.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store 
atomic i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4 ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.cta.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8 ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.cta - ; CHECK: st.release.cta.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.cta.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4 ; CHECK: fence.sc.cta - ; CHECK: ld.acquire.cta.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cta.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.cta - ; CHECK: st.release.cta.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.cta.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8 ret void @@ -2662,51 +2662,51 @@ define void @shared_seq_cst_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr ad ; CHECK-LABEL: shared_seq_cst_volatile_cta define void @shared_seq_cst_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("block") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("block") seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("block") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("block") seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("block") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: 
st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("block") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("block") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("block") seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("block") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("block") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("block") seq_cst, align 8 ret void @@ -2716,40 +2716,40 @@ define void @shared_seq_cst_volatile_cta(ptr addrspace(3) %a, ptr addrspace(3) % ; CHECK-LABEL: local_unordered_gpu define void @local_unordered_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") unordered, align 8 
%d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") unordered, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8 ret void @@ -2757,40 +2757,40 @@ define void @local_unordered_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr a ; CHECK-LABEL: local_unordered_volatile_gpu define void @local_unordered_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") unordered, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(5) %b 
syncscope("device") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") unordered, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") unordered, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") unordered, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") unordered, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") unordered, align 8 ret void @@ -2798,40 +2798,40 @@ define void @local_unordered_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) ; CHECK-LABEL: local_unordered_cta define void @local_unordered_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 
[%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") unordered, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8 ret void @@ -2839,40 +2839,40 @@ define void @local_unordered_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr a ; CHECK-LABEL: local_unordered_volatile_cta define void @local_unordered_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") unordered, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") unordered, align 2 %b.add = add i16 %b.load, 
1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") unordered, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") unordered, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") unordered, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") unordered, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") unordered, align 8 ret void @@ -2880,40 +2880,40 @@ define void @local_unordered_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) ; CHECK-LABEL: local_monotonic_gpu define void @local_monotonic_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: 
st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8 ret void @@ -2921,40 +2921,40 @@ define void @local_monotonic_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr a ; CHECK-LABEL: local_monotonic_volatile_gpu define void @local_monotonic_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") monotonic, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") monotonic, align 2 
%b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") monotonic, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") monotonic, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") monotonic, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") monotonic, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") monotonic, align 8 ret void @@ -2962,40 +2962,40 @@ define void @local_monotonic_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) ; CHECK-LABEL: local_monotonic_cta define void @local_monotonic_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 
[%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8 ret void @@ -3003,40 +3003,40 @@ define void @local_monotonic_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr a ; CHECK-LABEL: local_monotonic_volatile_cta define void @local_monotonic_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") monotonic, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") monotonic, align 2 %b.add = add i16 %b.load, 
1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") monotonic, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") monotonic, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") monotonic, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") monotonic, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") monotonic, align 8 ret void @@ -3044,40 +3044,40 @@ define void @local_monotonic_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) ; CHECK-LABEL: local_acq_rel_sys define void @local_acq_rel_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a release, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(5) %b acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(5) %b release, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(5) %c acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(5) %c release, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(5) %d acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(5) %d release, align 8 - ; 
CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e release, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(5) %e acquire, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e release, align 8 ret void @@ -3085,40 +3085,40 @@ define void @local_acq_rel_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add ; CHECK-LABEL: local_acq_rel_volatile_sys define void @local_acq_rel_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a release, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(5) %b acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(5) %b release, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile 
i32, ptr addrspace(5) %c acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(5) %c release, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(5) %d acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(5) %d release, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e release, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(5) %e acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e release, align 8 ret void @@ -3126,40 +3126,40 @@ define void @local_acq_rel_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b ; CHECK-LABEL: local_acq_rel_gpu define void @local_acq_rel_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; 
CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") acquire, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8 ret void @@ -3167,40 +3167,40 @@ define void @local_acq_rel_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add ; CHECK-LABEL: local_acq_rel_volatile_gpu define void @local_acq_rel_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") release, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], 
%rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") release, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") release, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") release, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") release, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") release, align 8 ret void @@ -3208,40 +3208,40 @@ define void @local_acq_rel_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b ; CHECK-LABEL: local_acq_rel_cta define void @local_acq_rel_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], 
%rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") acquire, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8 ret void @@ -3249,40 +3249,40 @@ define void @local_acq_rel_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add ; CHECK-LABEL: local_acq_rel_volatile_cta define void @local_acq_rel_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") release, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 
[%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") release, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") release, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") release, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") release, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") release, align 8 ret void @@ -3290,40 +3290,40 @@ define void @local_acq_rel_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b ; CHECK-LABEL: local_seq_cst_sys define void @local_seq_cst_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a seq_cst, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(5) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(5) %b seq_cst, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(5) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(5) %c seq_cst, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(5) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(5) %d seq_cst, align 8 - ; 
CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e seq_cst, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e seq_cst, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(5) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e seq_cst, align 8 ret void @@ -3331,40 +3331,40 @@ define void @local_seq_cst_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add ; CHECK-LABEL: local_seq_cst_volatile_sys define void @local_seq_cst_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a seq_cst, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(5) %b seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(5) %b seq_cst, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile 
i32, ptr addrspace(5) %c seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(5) %c seq_cst, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(5) %d seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(5) %d seq_cst, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e seq_cst, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e seq_cst, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(5) %e seq_cst, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e seq_cst, align 8 ret void @@ -3372,40 +3372,40 @@ define void @local_seq_cst_volatile_sys(ptr addrspace(5) %a, ptr addrspace(5) %b ; CHECK-LABEL: local_seq_cst_gpu define void @local_seq_cst_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; 
CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8 ret void @@ -3413,40 +3413,40 @@ define void @local_seq_cst_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add ; CHECK-LABEL: local_seq_cst_volatile_gpu define void @local_seq_cst_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("device") seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("device") seq_cst, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("device") seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], 
%rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("device") seq_cst, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("device") seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("device") seq_cst, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("device") seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("device") seq_cst, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("device") seq_cst, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("device") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("device") seq_cst, align 8 ret void @@ -3454,40 +3454,40 @@ define void @local_seq_cst_volatile_gpu(ptr addrspace(5) %a, ptr addrspace(5) %b ; CHECK-LABEL: local_seq_cst_cta define void @local_seq_cst_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], 
%rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8 ret void @@ -3495,40 +3495,40 @@ define void @local_seq_cst_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr add ; CHECK-LABEL: local_seq_cst_volatile_cta define void @local_seq_cst_volatile_cta(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("block") seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("block") seq_cst, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("block") seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 
[%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("block") seq_cst, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("block") seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("block") seq_cst, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("block") seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("block") seq_cst, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("block") seq_cst, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("block") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("block") seq_cst, align 8 ret void diff --git a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll index 345b55eb65bd..ae559f50d498 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-sm-90.ll @@ -34,40 +34,40 @@ ; CHECK-LABEL: generic_unordered_cluster define void @generic_unordered_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.relaxed.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a syncscope("cluster") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cluster.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a syncscope("cluster") unordered, align 1 - ; CHECK: ld.relaxed.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b syncscope("cluster") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cluster.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b syncscope("cluster") unordered, align 2 - ; CHECK: ld.relaxed.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c syncscope("cluster") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.cluster.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c syncscope("cluster") unordered, align 4 - ; CHECK: ld.relaxed.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: 
ld.relaxed.cluster.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d syncscope("cluster") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.cluster.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d syncscope("cluster") unordered, align 8 - ; CHECK: ld.relaxed.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e syncscope("cluster") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.cluster.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e syncscope("cluster") unordered, align 4 - ; CHECK: ld.relaxed.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e syncscope("cluster") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.cluster.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e syncscope("cluster") unordered, align 8 ret void @@ -75,40 +75,40 @@ define void @generic_unordered_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) l ; CHECK-LABEL: generic_unordered_volatile_cluster define void @generic_unordered_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a syncscope("cluster") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a syncscope("cluster") unordered, align 1 - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr %b syncscope("cluster") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr %b syncscope("cluster") unordered, align 2 - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr %c syncscope("cluster") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr %c syncscope("cluster") unordered, align 4 - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr %d syncscope("cluster") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 
[%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr %d syncscope("cluster") unordered, align 8 - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e syncscope("cluster") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e syncscope("cluster") unordered, align 4 - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr %e syncscope("cluster") unordered, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e syncscope("cluster") unordered, align 8 ret void @@ -116,40 +116,40 @@ define void @generic_unordered_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ; CHECK-LABEL: generic_monotonic_cluster define void @generic_monotonic_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.relaxed.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a syncscope("cluster") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cluster.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1 - ; CHECK: ld.relaxed.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b syncscope("cluster") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.cluster.u16 [%rd{{[0-9]+}}], 
%rs{{[0-9]+}} + ; CHECK: st.relaxed.cluster.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2 - ; CHECK: ld.relaxed.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c syncscope("cluster") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.cluster.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4 - ; CHECK: ld.relaxed.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d syncscope("cluster") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.cluster.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8 - ; CHECK: ld.relaxed.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e syncscope("cluster") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.cluster.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e syncscope("cluster") monotonic, align 4 - ; CHECK: ld.relaxed.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e syncscope("cluster") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.cluster.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e syncscope("cluster") monotonic, align 8 ret void @@ -157,40 +157,40 @@ define void @generic_monotonic_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) l ; CHECK-LABEL: generic_monotonic_volatile_cluster define void @generic_monotonic_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.volatile.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a syncscope("cluster") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a syncscope("cluster") monotonic, align 1 - ; CHECK: ld.volatile.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr %b syncscope("cluster") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr %b syncscope("cluster") monotonic, align 2 - ; CHECK: ld.volatile.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr %c syncscope("cluster") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr %c syncscope("cluster") monotonic, align 4 - ; CHECK: ld.volatile.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr %d syncscope("cluster") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.u64 
[%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr %d syncscope("cluster") monotonic, align 8 - ; CHECK: ld.volatile.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e syncscope("cluster") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.volatile.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e syncscope("cluster") monotonic, align 4 - ; CHECK: ld.volatile.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr %e syncscope("cluster") monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.volatile.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.volatile.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e syncscope("cluster") monotonic, align 8 ret void @@ -198,40 +198,40 @@ define void @generic_monotonic_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ; CHECK-LABEL: generic_acq_rel_cluster define void @generic_acq_rel_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.acquire.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a syncscope("cluster") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cluster.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a syncscope("cluster") release, align 1 - ; CHECK: ld.acquire.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b syncscope("cluster") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} 
+ ; CHECK: st.release.cluster.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b syncscope("cluster") release, align 2 - ; CHECK: ld.acquire.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c syncscope("cluster") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.cluster.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c syncscope("cluster") release, align 4 - ; CHECK: ld.acquire.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d syncscope("cluster") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.cluster.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d syncscope("cluster") release, align 8 - ; CHECK: ld.acquire.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e syncscope("cluster") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.cluster.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e syncscope("cluster") release, align 4 - ; CHECK: ld.acquire.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e syncscope("cluster") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.cluster.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e syncscope("cluster") release, align 8 ret void @@ -239,40 +239,40 @@ define void @generic_acq_rel_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) loc ; CHECK-LABEL: generic_acq_rel_volatile_cluster define void @generic_acq_rel_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a syncscope("cluster") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a syncscope("cluster") release, align 1 - ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr %b syncscope("cluster") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr %b syncscope("cluster") release, align 2 - ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr %c syncscope("cluster") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr %c syncscope("cluster") release, align 4 - ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr %d syncscope("cluster") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: 
st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr %d syncscope("cluster") release, align 8 - ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e syncscope("cluster") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e syncscope("cluster") release, align 4 - ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr %e syncscope("cluster") acquire, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e syncscope("cluster") release, align 8 ret void @@ -281,51 +281,51 @@ define void @generic_acq_rel_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, pt ; CHECK-LABEL: generic_sc_cluster define void @generic_sc_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr %a syncscope("cluster") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cluster.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1 ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr %b syncscope("cluster") 
seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cluster.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2 ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr %c syncscope("cluster") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.cluster.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4 ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr %d syncscope("cluster") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.cluster.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8 ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr %e syncscope("cluster") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.cluster.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr %e syncscope("cluster") seq_cst, align 4 ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr %e syncscope("cluster") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.cluster.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr %e syncscope("cluster") seq_cst, align 8 ret void @@ -334,51 +334,51 @@ define void @generic_sc_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_un ; CHECK-LABEL: generic_sc_volatile_cluster define void @generic_sc_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr %a syncscope("cluster") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr %a syncscope("cluster") seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr %b syncscope("cluster") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr %b syncscope("cluster") seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr %c syncscope("cluster") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr %c syncscope("cluster") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: 
ld.acquire.sys.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr %d syncscope("cluster") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr %d syncscope("cluster") seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr %e syncscope("cluster") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr %e syncscope("cluster") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr %e syncscope("cluster") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.sys - ; CHECK: st.release.sys.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr %e syncscope("cluster") seq_cst, align 8 ret void @@ -388,40 +388,40 @@ define void @generic_sc_volatile_cluster(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e) ; CHECK-LABEL: global_unordered_cluster define void @global_unordered_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cluster.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1 - ; CHECK: ld.relaxed.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cluster.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2 - ; CHECK: ld.relaxed.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.cluster.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4 - ; 
CHECK: ld.relaxed.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.cluster.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8 - ; CHECK: ld.relaxed.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.cluster.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4 - ; CHECK: ld.relaxed.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.cluster.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8 ret void @@ -429,40 +429,40 @@ define void @global_unordered_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ; CHECK-LABEL: global_unordered_volatile_cluster define void @global_unordered_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") unordered, align 1 - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") unordered, align 2 - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.volatile.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") unordered, align 4 - ; CHECK: 
ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.volatile.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") unordered, align 8 - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.volatile.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 4 - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.volatile.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") unordered, align 8 ret void @@ -470,40 +470,40 @@ define void @global_unordered_volatile_cluster(ptr addrspace(1) %a, ptr addrspac ; CHECK-LABEL: global_monotonic_cluster define void @global_monotonic_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cluster.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1 - ; CHECK: ld.relaxed.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cluster.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2 - ; CHECK: ld.relaxed.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.cluster.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4 - ; 
CHECK: ld.relaxed.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.cluster.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8 - ; CHECK: ld.relaxed.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.cluster.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4 - ; CHECK: ld.relaxed.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.cluster.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8 ret void @@ -511,40 +511,40 @@ define void @global_monotonic_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ; CHECK-LABEL: global_monotonic_volatile_cluster define void @global_monotonic_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.volatile.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") monotonic, align 1 - ; CHECK: ld.volatile.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") monotonic, align 2 - ; CHECK: ld.volatile.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.volatile.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") monotonic, align 4 - ; CHECK: 
ld.volatile.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.volatile.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") monotonic, align 8 - ; CHECK: ld.volatile.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.volatile.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 4 - ; CHECK: ld.volatile.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.volatile.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") monotonic, align 8 ret void @@ -552,40 +552,40 @@ define void @global_monotonic_volatile_cluster(ptr addrspace(1) %a, ptr addrspac ; CHECK-LABEL: global_acq_rel_cluster define void @global_acq_rel_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.acquire.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cluster.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1 - ; CHECK: ld.acquire.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cluster.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2 - ; CHECK: ld.acquire.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.cluster.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, align 4 - ; CHECK: 
ld.acquire.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.cluster.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8 - ; CHECK: ld.acquire.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.cluster.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4 - ; CHECK: ld.acquire.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.cluster.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8 ret void @@ -593,40 +593,40 @@ define void @global_acq_rel_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, pt ; CHECK-LABEL: global_acq_rel_volatile_cluster define void @global_acq_rel_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") release, align 1 - ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") release, align 2 - ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") release, 
align 4 - ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") release, align 8 - ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") release, align 4 - ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") release, align 8 ret void @@ -635,51 +635,51 @@ define void @global_acq_rel_volatile_cluster(ptr addrspace(1) %a, ptr addrspace( ; CHECK-LABEL: global_seq_cst_cluster define void @global_seq_cst_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cluster.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1 ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cluster.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2 ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: 
st.release.cluster.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4 ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.cluster.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8 ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.cluster.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4 ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.cluster.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8 ret void @@ -688,51 +688,51 @@ define void @global_seq_cst_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, pt ; CHECK-LABEL: global_seq_cst_volatile_cluster define void @global_seq_cst_volatile_cluster(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %d, ptr addrspace(1) %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(1) %a syncscope("cluster") seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(1) %b syncscope("cluster") seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u32 [%rd{{[0-9]+}}], 
%r{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(1) %c syncscope("cluster") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(1) %d syncscope("cluster") seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.global.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.global.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.sys - ; CHECK: st.release.sys.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(1) %e syncscope("cluster") seq_cst, align 8 ret void @@ -742,40 +742,40 @@ define void @global_seq_cst_volatile_cluster(ptr addrspace(1) %a, ptr addrspace( ; CHECK-LABEL: shared_unordered_cluster define void @shared_unordered_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cluster.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1 - ; CHECK: ld.relaxed.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cluster.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2 - ; CHECK: ld.relaxed.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.cluster.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c 
syncscope("cluster") unordered, align 4 - ; CHECK: ld.relaxed.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.cluster.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8 - ; CHECK: ld.relaxed.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.cluster.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4 - ; CHECK: ld.relaxed.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.cluster.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8 ret void @@ -783,40 +783,40 @@ define void @shared_unordered_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ; CHECK-LABEL: shared_unordered_volatile_cluster define void @shared_unordered_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") unordered, align 1 - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") unordered, align 2 - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") unordered, align 4 - ; CHECK: 
ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") unordered, align 8 - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 4 - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") unordered, align 8 ret void @@ -824,40 +824,40 @@ define void @shared_unordered_volatile_cluster(ptr addrspace(3) %a, ptr addrspac ; CHECK-LABEL: shared_monotonic_cluster define void @shared_monotonic_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.relaxed.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.relaxed.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cluster.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1 - ; CHECK: ld.relaxed.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.relaxed.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.relaxed.cluster.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2 - ; CHECK: ld.relaxed.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.relaxed.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.relaxed.cluster.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4 - ; 
CHECK: ld.relaxed.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.relaxed.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.relaxed.cluster.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8 - ; CHECK: ld.relaxed.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.relaxed.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.relaxed.cluster.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4 - ; CHECK: ld.relaxed.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.relaxed.cluster.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.relaxed.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.relaxed.cluster.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8 ret void @@ -865,40 +865,40 @@ define void @shared_monotonic_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ; CHECK-LABEL: shared_monotonic_volatile_cluster define void @shared_monotonic_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.volatile.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.volatile.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") monotonic, align 1 - ; CHECK: ld.volatile.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.volatile.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.volatile.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") monotonic, align 2 - ; CHECK: ld.volatile.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.volatile.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") monotonic, align 4 - ; CHECK: 
ld.volatile.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.volatile.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") monotonic, align 8 - ; CHECK: ld.volatile.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.volatile.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.volatile.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 4 - ; CHECK: ld.volatile.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.volatile.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.volatile.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.volatile.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") monotonic, align 8 ret void @@ -906,40 +906,40 @@ define void @shared_monotonic_volatile_cluster(ptr addrspace(3) %a, ptr addrspac ; CHECK-LABEL: shared_acq_rel_cluster define void @shared_acq_rel_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.acquire.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cluster.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1 - ; CHECK: ld.acquire.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cluster.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2 - ; CHECK: ld.acquire.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.cluster.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, align 4 - ; CHECK: 
ld.acquire.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.cluster.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8 - ; CHECK: ld.acquire.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.cluster.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4 - ; CHECK: ld.acquire.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.cluster.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8 ret void @@ -947,40 +947,40 @@ define void @shared_acq_rel_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, pt ; CHECK-LABEL: shared_acq_rel_volatile_cluster define void @shared_acq_rel_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { - ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") release, align 1 - ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") release, align 2 - ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") release, 
align 4 - ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") release, align 8 - ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") release, align 4 - ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") release, align 8 ret void @@ -989,51 +989,51 @@ define void @shared_acq_rel_volatile_cluster(ptr addrspace(3) %a, ptr addrspace( ; CHECK-LABEL: shared_seq_cst_cluster define void @shared_seq_cst_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cluster.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1 ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.cluster.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2 ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: 
st.release.cluster.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4 ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.cluster.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8 ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.cluster.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4 ; CHECK: fence.sc.cluster - ; CHECK: ld.acquire.cluster.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.cluster.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.cluster - ; CHECK: st.release.cluster.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.cluster.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8 ret void @@ -1042,51 +1042,51 @@ define void @shared_seq_cst_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, pt ; CHECK-LABEL: shared_seq_cst_volatile_cluster define void @shared_seq_cst_volatile_cluster(ptr addrspace(3) %a, ptr addrspace(3) %b, ptr addrspace(3) %c, ptr addrspace(3) %d, ptr addrspace(3) %e) local_unnamed_addr { ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1 %a.add = add i8 %a.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(3) %a syncscope("cluster") seq_cst, align 1 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2 %b.add = add i16 %b.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.release.sys.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(3) %b syncscope("cluster") seq_cst, align 2 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4 %c.add = add i32 %c.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u32 [%rd{{[0-9]+}}], 
%r{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(3) %c syncscope("cluster") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8 %d.add = add i64 %d.load, 1 ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(3) %d syncscope("cluster") seq_cst, align 8 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4 %e.add = fadd float %e.load, 1. ; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.release.sys.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 4 ; CHECK: fence.sc.sys - ; CHECK: ld.acquire.sys.shared.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.acquire.sys.shared.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
; CHECK: fence.sc.sys - ; CHECK: st.release.sys.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.release.sys.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(3) %e syncscope("cluster") seq_cst, align 8 ret void @@ -1096,40 +1096,40 @@ define void @shared_seq_cst_volatile_cluster(ptr addrspace(3) %a, ptr addrspace( ; CHECK-LABEL: local_unordered_cluster define void @local_unordered_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") unordered, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") 
unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8 ret void @@ -1137,40 +1137,40 @@ define void @local_unordered_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, p ; CHECK-LABEL: local_unordered_volatile_cluster define void @local_unordered_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") unordered, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") unordered, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic 
volatile i16, ptr addrspace(5) %b syncscope("cluster") unordered, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") unordered, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") unordered, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") unordered, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") unordered, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") unordered, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") unordered, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") unordered, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") unordered, align 8 ret void @@ -1178,40 +1178,40 @@ define void @local_unordered_volatile_cluster(ptr addrspace(5) %a, ptr addrspace ; CHECK-LABEL: local_monotonic_cluster define void @local_monotonic_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8 %d.add = add i64 %d.load, 1 - 
; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8 ret void @@ -1219,40 +1219,40 @@ define void @local_monotonic_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, p ; CHECK-LABEL: local_monotonic_volatile_cluster define void @local_monotonic_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") monotonic, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") 
monotonic, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") monotonic, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") monotonic, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") monotonic, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") monotonic, align 8 ret void @@ -1260,40 +1260,40 @@ define void @local_monotonic_volatile_cluster(ptr addrspace(5) %a, ptr addrspace ; CHECK-LABEL: local_acq_rel_cluster define void @local_acq_rel_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: 
st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8 ret void @@ -1301,40 +1301,40 @@ define void @local_acq_rel_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr ; CHECK-LABEL: local_acq_rel_volatile_cluster define void @local_acq_rel_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") acquire, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") release, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") acquire, align 2 %b.add = 
add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") release, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") acquire, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") release, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") acquire, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") release, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") acquire, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") release, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") acquire, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") release, align 8 ret void @@ -1342,40 +1342,40 @@ define void @local_acq_rel_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5 ; CHECK-LABEL: local_seq_cst_cluster define void @local_seq_cst_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2 %b.add = add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: 
st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8 %f.add = fadd double %f.load, 1. - ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8 ret void @@ -1383,40 +1383,40 @@ define void @local_seq_cst_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr ; CHECK-LABEL: local_seq_cst_volatile_cluster define void @local_seq_cst_volatile_cluster(ptr addrspace(5) %a, ptr addrspace(5) %b, ptr addrspace(5) %c, ptr addrspace(5) %d, ptr addrspace(5) %e) local_unnamed_addr { - ; CHECK: ld.local.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %a.load = load atomic volatile i8, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1 %a.add = add i8 %a.load, 1 - ; CHECK: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i8 %a.add, ptr addrspace(5) %a syncscope("cluster") seq_cst, align 1 - ; CHECK: ld.local.u16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b16 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] %b.load = load atomic volatile i16, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2 %b.add = 
add i16 %b.load, 1 - ; CHECK: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} + ; CHECK: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} store atomic volatile i16 %b.add, ptr addrspace(5) %b syncscope("cluster") seq_cst, align 2 - ; CHECK: ld.local.u32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %r{{[0-9]+}}, [%rd{{[0-9]+}}] %c.load = load atomic volatile i32, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4 %c.add = add i32 %c.load, 1 - ; CHECK: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} store atomic volatile i32 %c.add, ptr addrspace(5) %c syncscope("cluster") seq_cst, align 4 - ; CHECK: ld.local.u64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %rd{{[0-9]+}}, [%rd{{[0-9]+}}] %d.load = load atomic volatile i64, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8 %d.add = add i64 %d.load, 1 - ; CHECK: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} store atomic volatile i64 %d.add, ptr addrspace(5) %d syncscope("cluster") seq_cst, align 8 - ; CHECK: ld.local.f32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b32 %f{{[0-9]+}}, [%rd{{[0-9]+}}] %e.load = load atomic volatile float, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4 %e.add = fadd float %e.load, 1. - ; CHECK: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} + ; CHECK: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} store atomic volatile float %e.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 4 - ; CHECK: ld.local.f64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] + ; CHECK: ld.local.b64 %fd{{[0-9]+}}, [%rd{{[0-9]+}}] %f.load = load atomic volatile double, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8 %f.add = fadd double %f.load, 1. 
- ; CHECK: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} + ; CHECK: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} store atomic volatile double %f.add, ptr addrspace(5) %e syncscope("cluster") seq_cst, align 8 ret void diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll index 3215fce96400..2b5553a77fe9 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-vectors.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll @@ -31,11 +31,11 @@ define void @generic_2xi8(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi8_param_0]; -; CHECK-NEXT: ld.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_2xi8_param_0]; +; CHECK-NEXT: ld.v2.b8 {%rs1, %rs2}, [%rd1]; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: st.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: st.v2.b8 [%rd1], {%rs4, %rs3}; ; CHECK-NEXT: ret; %a.load = load <2 x i8>, ptr %a %a.add = add <2 x i8> %a.load, @@ -54,8 +54,8 @@ define void @generic_4xi8(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi8_param_0]; -; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_4xi8_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; @@ -75,7 +75,7 @@ define void @generic_4xi8(ptr %a) { ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; ; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; -; CHECK-NEXT: st.u32 [%rd1], %r12; +; CHECK-NEXT: st.b32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load <4 x i8>, ptr %a %a.add = add <4 x i8> %a.load, @@ -91,7 +91,7 @@ define void @generic_8xi8(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xi8_param_0]; 
+; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xi8_param_0]; ; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; @@ -147,7 +147,7 @@ define void @generic_16xi8(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_16xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_16xi8_param_0]; ; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; @@ -241,13 +241,13 @@ define void @generic_2xi16(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi16_param_0]; -; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_2xi16_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; -; CHECK-NEXT: st.u32 [%rd1], %r2; +; CHECK-NEXT: st.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load <2 x i16>, ptr %a %a.add = add <2 x i16> %a.load, @@ -262,13 +262,13 @@ define void @generic_4xi16(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi16_param_0]; -; CHECK-NEXT: ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_4xi16_param_0]; +; CHECK-NEXT: ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; -; CHECK-NEXT: st.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: st.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; ; CHECK-NEXT: ret; %a.load = load <4 x i16>, ptr %a %a.add = add <4 x i16> %a.load, @@ -284,7 +284,7 @@ define void @generic_8xi16(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; 
CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xi16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xi16_param_0]; ; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; @@ -317,11 +317,11 @@ define void @generic_2xi32(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi32_param_0]; -; CHECK-NEXT: ld.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_2xi32_param_0]; +; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: add.s32 %r3, %r2, 1; ; CHECK-NEXT: add.s32 %r4, %r1, 1; -; CHECK-NEXT: st.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: st.v2.b32 [%rd1], {%r4, %r3}; ; CHECK-NEXT: ret; %a.load = load <2 x i32>, ptr %a %a.add = add <2 x i32> %a.load, @@ -336,13 +336,13 @@ define void @generic_4xi32(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi32_param_0]; -; CHECK-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_4xi32_param_0]; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: add.s32 %r5, %r4, 1; ; CHECK-NEXT: add.s32 %r6, %r3, 1; ; CHECK-NEXT: add.s32 %r7, %r2, 1; ; CHECK-NEXT: add.s32 %r8, %r1, 1; -; CHECK-NEXT: st.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; ; CHECK-NEXT: ret; %a.load = load <4 x i32>, ptr %a %a.add = add <4 x i32> %a.load, @@ -356,11 +356,11 @@ define void @generic_2xi64(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi64_param_0]; -; CHECK-NEXT: ld.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_2xi64_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; -; CHECK-NEXT: 
st.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: st.v2.b64 [%rd1], {%rd5, %rd4}; ; CHECK-NEXT: ret; %a.load = load <2 x i64>, ptr %a %a.add = add <2 x i64> %a.load, @@ -375,11 +375,11 @@ define void @generic_2xfloat(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xfloat_param_0]; -; CHECK-NEXT: ld.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_2xfloat_param_0]; +; CHECK-NEXT: ld.v2.b32 {%f1, %f2}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; -; CHECK-NEXT: st.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: st.v2.b32 [%rd1], {%f4, %f3}; ; CHECK-NEXT: ret; %a.load = load <2 x float>, ptr %a %a.add = fadd <2 x float> %a.load, @@ -394,13 +394,13 @@ define void @generic_4xfloat(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xfloat_param_0]; -; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_4xfloat_param_0]; +; CHECK-NEXT: ld.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; -; CHECK-NEXT: st.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: st.v4.b32 [%rd1], {%f8, %f7, %f6, %f5}; ; CHECK-NEXT: ret; %a.load = load <4 x float>, ptr %a %a.add = fadd <4 x float> %a.load, @@ -415,11 +415,11 @@ define void @generic_2xdouble(ptr %a) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xdouble_param_0]; -; CHECK-NEXT: ld.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_2xdouble_param_0]; +; CHECK-NEXT: ld.v2.b64 {%fd1, %fd2}, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; -; 
CHECK-NEXT: st.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: st.v2.b64 [%rd1], {%fd4, %fd3}; ; CHECK-NEXT: ret; %a.load = load <2 x double>, ptr %a %a.add = fadd <2 x double> %a.load, @@ -453,11 +453,11 @@ define void @generic_volatile_2xi8(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi8_param_0]; -; CHECK-NEXT: ld.volatile.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_2xi8_param_0]; +; CHECK-NEXT: ld.volatile.v2.b8 {%rs1, %rs2}, [%rd1]; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: st.volatile.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: st.volatile.v2.b8 [%rd1], {%rs4, %rs3}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i8>, ptr %a %a.add = add <2 x i8> %a.load, @@ -475,8 +475,8 @@ define void @generic_volatile_4xi8(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi8_param_0]; -; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_4xi8_param_0]; +; CHECK-NEXT: ld.volatile.b32 %r1, [%rd1]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; @@ -496,7 +496,7 @@ define void @generic_volatile_4xi8(ptr %a) { ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; ; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; -; CHECK-NEXT: st.volatile.u32 [%rd1], %r12; +; CHECK-NEXT: st.volatile.b32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i8>, ptr %a %a.add = add <4 x i8> %a.load, @@ -512,7 +512,7 @@ define void @generic_volatile_8xi8(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_8xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_8xi8_param_0]; ; CHECK-NEXT: ld.volatile.v2.b32 
{%r1, %r2}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; @@ -568,7 +568,7 @@ define void @generic_volatile_16xi8(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_16xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_16xi8_param_0]; ; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; @@ -662,13 +662,13 @@ define void @generic_volatile_2xi16(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi16_param_0]; -; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_2xi16_param_0]; +; CHECK-NEXT: ld.volatile.b32 %r1, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; -; CHECK-NEXT: st.volatile.u32 [%rd1], %r2; +; CHECK-NEXT: st.volatile.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i16>, ptr %a %a.add = add <2 x i16> %a.load, @@ -683,13 +683,13 @@ define void @generic_volatile_4xi16(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi16_param_0]; -; CHECK-NEXT: ld.volatile.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_4xi16_param_0]; +; CHECK-NEXT: ld.volatile.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; -; CHECK-NEXT: st.volatile.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: st.volatile.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i16>, ptr %a %a.add = add <4 x i16> %a.load, @@ 
-705,7 +705,7 @@ define void @generic_volatile_8xi16(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_8xi16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_8xi16_param_0]; ; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; @@ -738,11 +738,11 @@ define void @generic_volatile_2xi32(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi32_param_0]; -; CHECK-NEXT: ld.volatile.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_2xi32_param_0]; +; CHECK-NEXT: ld.volatile.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: add.s32 %r3, %r2, 1; ; CHECK-NEXT: add.s32 %r4, %r1, 1; -; CHECK-NEXT: st.volatile.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: st.volatile.v2.b32 [%rd1], {%r4, %r3}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i32>, ptr %a %a.add = add <2 x i32> %a.load, @@ -757,13 +757,13 @@ define void @generic_volatile_4xi32(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi32_param_0]; -; CHECK-NEXT: ld.volatile.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_4xi32_param_0]; +; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: add.s32 %r5, %r4, 1; ; CHECK-NEXT: add.s32 %r6, %r3, 1; ; CHECK-NEXT: add.s32 %r7, %r2, 1; ; CHECK-NEXT: add.s32 %r8, %r1, 1; -; CHECK-NEXT: st.volatile.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: st.volatile.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i32>, ptr %a %a.add = add <4 x i32> %a.load, @@ -777,11 +777,11 @@ define void @generic_volatile_2xi64(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 
%rd1, [generic_volatile_2xi64_param_0]; -; CHECK-NEXT: ld.volatile.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_2xi64_param_0]; +; CHECK-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1]; ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; -; CHECK-NEXT: st.volatile.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: st.volatile.v2.b64 [%rd1], {%rd5, %rd4}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i64>, ptr %a %a.add = add <2 x i64> %a.load, @@ -796,11 +796,11 @@ define void @generic_volatile_2xfloat(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xfloat_param_0]; -; CHECK-NEXT: ld.volatile.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_2xfloat_param_0]; +; CHECK-NEXT: ld.volatile.v2.b32 {%f1, %f2}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: st.volatile.v2.b32 [%rd1], {%f4, %f3}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x float>, ptr %a %a.add = fadd <2 x float> %a.load, @@ -815,13 +815,13 @@ define void @generic_volatile_4xfloat(ptr %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xfloat_param_0]; -; CHECK-NEXT: ld.volatile.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_4xfloat_param_0]; +; CHECK-NEXT: ld.volatile.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: st.volatile.v4.b32 [%rd1], {%f8, %f7, %f6, %f5}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x float>, ptr %a %a.add = fadd <4 x 
float> %a.load, @@ -836,11 +836,11 @@ define void @generic_volatile_2xdouble(ptr %a) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xdouble_param_0]; -; CHECK-NEXT: ld.volatile.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_2xdouble_param_0]; +; CHECK-NEXT: ld.volatile.v2.b64 {%fd1, %fd2}, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.volatile.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: st.volatile.v2.b64 [%rd1], {%fd4, %fd3}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x double>, ptr %a %a.add = fadd <2 x double> %a.load, @@ -859,11 +859,11 @@ define void @global_2xi8(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi8_param_0]; -; CHECK-NEXT: ld.global.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_2xi8_param_0]; +; CHECK-NEXT: ld.global.v2.b8 {%rs1, %rs2}, [%rd1]; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: st.global.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: st.global.v2.b8 [%rd1], {%rs4, %rs3}; ; CHECK-NEXT: ret; %a.load = load <2 x i8>, ptr addrspace(1) %a %a.add = add <2 x i8> %a.load, @@ -879,8 +879,8 @@ define void @global_4xi8(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi8_param_0]; -; CHECK-NEXT: ld.global.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_4xi8_param_0]; +; CHECK-NEXT: ld.global.b32 %r1, [%rd1]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; @@ -900,7 +900,7 @@ define void @global_4xi8(ptr addrspace(1) %a) { ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; ; CHECK-NEXT: 
prmt.b32 %r12, %r11, %r6, 0x5410U; -; CHECK-NEXT: st.global.u32 [%rd1], %r12; +; CHECK-NEXT: st.global.b32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load <4 x i8>, ptr addrspace(1) %a %a.add = add <4 x i8> %a.load, @@ -916,7 +916,7 @@ define void @global_8xi8(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_8xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_8xi8_param_0]; ; CHECK-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; @@ -972,7 +972,7 @@ define void @global_16xi8(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_16xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_16xi8_param_0]; ; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; @@ -1066,13 +1066,13 @@ define void @global_2xi16(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi16_param_0]; -; CHECK-NEXT: ld.global.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_2xi16_param_0]; +; CHECK-NEXT: ld.global.b32 %r1, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; -; CHECK-NEXT: st.global.u32 [%rd1], %r2; +; CHECK-NEXT: st.global.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load <2 x i16>, ptr addrspace(1) %a %a.add = add <2 x i16> %a.load, @@ -1087,13 +1087,13 @@ define void @global_4xi16(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi16_param_0]; -; CHECK-NEXT: ld.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_4xi16_param_0]; +; 
CHECK-NEXT: ld.global.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; -; CHECK-NEXT: st.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: st.global.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; ; CHECK-NEXT: ret; %a.load = load <4 x i16>, ptr addrspace(1) %a %a.add = add <4 x i16> %a.load, @@ -1109,7 +1109,7 @@ define void @global_8xi16(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_8xi16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_8xi16_param_0]; ; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; @@ -1142,11 +1142,11 @@ define void @global_2xi32(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi32_param_0]; -; CHECK-NEXT: ld.global.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_2xi32_param_0]; +; CHECK-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: add.s32 %r3, %r2, 1; ; CHECK-NEXT: add.s32 %r4, %r1, 1; -; CHECK-NEXT: st.global.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: st.global.v2.b32 [%rd1], {%r4, %r3}; ; CHECK-NEXT: ret; %a.load = load <2 x i32>, ptr addrspace(1) %a %a.add = add <2 x i32> %a.load, @@ -1161,13 +1161,13 @@ define void @global_4xi32(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi32_param_0]; -; CHECK-NEXT: ld.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_4xi32_param_0]; +; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: add.s32 %r5, %r4, 1; ; CHECK-NEXT: add.s32 %r6, %r3, 1; ; CHECK-NEXT: add.s32 %r7, %r2, 1; ; CHECK-NEXT: add.s32 %r8, %r1, 1; -; 
CHECK-NEXT: st.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; ; CHECK-NEXT: ret; %a.load = load <4 x i32>, ptr addrspace(1) %a %a.add = add <4 x i32> %a.load, @@ -1181,11 +1181,11 @@ define void @global_2xi64(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi64_param_0]; -; CHECK-NEXT: ld.global.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_2xi64_param_0]; +; CHECK-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1]; ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; -; CHECK-NEXT: st.global.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: st.global.v2.b64 [%rd1], {%rd5, %rd4}; ; CHECK-NEXT: ret; %a.load = load <2 x i64>, ptr addrspace(1) %a %a.add = add <2 x i64> %a.load, @@ -1200,11 +1200,11 @@ define void @global_2xfloat(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_2xfloat_param_0]; -; CHECK-NEXT: ld.global.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_2xfloat_param_0]; +; CHECK-NEXT: ld.global.v2.b32 {%f1, %f2}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; -; CHECK-NEXT: st.global.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: st.global.v2.b32 [%rd1], {%f4, %f3}; ; CHECK-NEXT: ret; %a.load = load <2 x float>, ptr addrspace(1) %a %a.add = fadd <2 x float> %a.load, @@ -1219,13 +1219,13 @@ define void @global_4xfloat(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_4xfloat_param_0]; -; CHECK-NEXT: ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_4xfloat_param_0]; +; CHECK-NEXT: ld.global.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; ; CHECK-NEXT: add.rn.f32 
%f6, %f3, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; -; CHECK-NEXT: st.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: st.global.v4.b32 [%rd1], {%f8, %f7, %f6, %f5}; ; CHECK-NEXT: ret; %a.load = load <4 x float>, ptr addrspace(1) %a %a.add = fadd <4 x float> %a.load, @@ -1240,11 +1240,11 @@ define void @global_2xdouble(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_2xdouble_param_0]; -; CHECK-NEXT: ld.global.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_2xdouble_param_0]; +; CHECK-NEXT: ld.global.v2.b64 {%fd1, %fd2}, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.global.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: st.global.v2.b64 [%rd1], {%fd4, %fd3}; ; CHECK-NEXT: ret; %a.load = load <2 x double>, ptr addrspace(1) %a %a.add = fadd <2 x double> %a.load, @@ -1261,11 +1261,11 @@ define void @global_volatile_2xi8(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi8_param_0]; -; CHECK-NEXT: ld.volatile.global.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_2xi8_param_0]; +; CHECK-NEXT: ld.volatile.global.v2.b8 {%rs1, %rs2}, [%rd1]; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: st.volatile.global.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: st.volatile.global.v2.b8 [%rd1], {%rs4, %rs3}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i8>, ptr addrspace(1) %a %a.add = add <2 x i8> %a.load, @@ -1281,8 +1281,8 @@ define void @global_volatile_4xi8(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi8_param_0]; -; CHECK-NEXT: 
ld.volatile.global.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_4xi8_param_0]; +; CHECK-NEXT: ld.volatile.global.b32 %r1, [%rd1]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; @@ -1302,7 +1302,7 @@ define void @global_volatile_4xi8(ptr addrspace(1) %a) { ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; ; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; -; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r12; +; CHECK-NEXT: st.volatile.global.b32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i8>, ptr addrspace(1) %a %a.add = add <4 x i8> %a.load, @@ -1318,7 +1318,7 @@ define void @global_volatile_8xi8(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_8xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_8xi8_param_0]; ; CHECK-NEXT: ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; @@ -1374,7 +1374,7 @@ define void @global_volatile_16xi8(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_16xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_16xi8_param_0]; ; CHECK-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; @@ -1468,13 +1468,13 @@ define void @global_volatile_2xi16(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi16_param_0]; -; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_2xi16_param_0]; +; CHECK-NEXT: ld.volatile.global.b32 %r1, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: add.s16 %rs3, 
%rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; -; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r2; +; CHECK-NEXT: st.volatile.global.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i16>, ptr addrspace(1) %a %a.add = add <2 x i16> %a.load, @@ -1489,13 +1489,13 @@ define void @global_volatile_4xi16(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi16_param_0]; -; CHECK-NEXT: ld.volatile.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_4xi16_param_0]; +; CHECK-NEXT: ld.volatile.global.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; -; CHECK-NEXT: st.volatile.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: st.volatile.global.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i16>, ptr addrspace(1) %a %a.add = add <4 x i16> %a.load, @@ -1511,7 +1511,7 @@ define void @global_volatile_8xi16(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_8xi16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_8xi16_param_0]; ; CHECK-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; @@ -1544,11 +1544,11 @@ define void @global_volatile_2xi32(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi32_param_0]; -; CHECK-NEXT: ld.volatile.global.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_2xi32_param_0]; +; CHECK-NEXT: ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: add.s32 
%r3, %r2, 1; ; CHECK-NEXT: add.s32 %r4, %r1, 1; -; CHECK-NEXT: st.volatile.global.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: st.volatile.global.v2.b32 [%rd1], {%r4, %r3}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i32>, ptr addrspace(1) %a %a.add = add <2 x i32> %a.load, @@ -1563,13 +1563,13 @@ define void @global_volatile_4xi32(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi32_param_0]; -; CHECK-NEXT: ld.volatile.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_4xi32_param_0]; +; CHECK-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: add.s32 %r5, %r4, 1; ; CHECK-NEXT: add.s32 %r6, %r3, 1; ; CHECK-NEXT: add.s32 %r7, %r2, 1; ; CHECK-NEXT: add.s32 %r8, %r1, 1; -; CHECK-NEXT: st.volatile.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: st.volatile.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i32>, ptr addrspace(1) %a %a.add = add <4 x i32> %a.load, @@ -1583,11 +1583,11 @@ define void @global_volatile_2xi64(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi64_param_0]; -; CHECK-NEXT: ld.volatile.global.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_2xi64_param_0]; +; CHECK-NEXT: ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1]; ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; -; CHECK-NEXT: st.volatile.global.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: st.volatile.global.v2.b64 [%rd1], {%rd5, %rd4}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i64>, ptr addrspace(1) %a %a.add = add <2 x i64> %a.load, @@ -1602,11 +1602,11 @@ define void @global_volatile_2xfloat(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 
%rd1, [global_volatile_2xfloat_param_0]; -; CHECK-NEXT: ld.volatile.global.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_2xfloat_param_0]; +; CHECK-NEXT: ld.volatile.global.v2.b32 {%f1, %f2}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.global.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: st.volatile.global.v2.b32 [%rd1], {%f4, %f3}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x float>, ptr addrspace(1) %a %a.add = fadd <2 x float> %a.load, @@ -1621,13 +1621,13 @@ define void @global_volatile_4xfloat(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xfloat_param_0]; -; CHECK-NEXT: ld.volatile.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_4xfloat_param_0]; +; CHECK-NEXT: ld.volatile.global.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: st.volatile.global.v4.b32 [%rd1], {%f8, %f7, %f6, %f5}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x float>, ptr addrspace(1) %a %a.add = fadd <4 x float> %a.load, @@ -1642,11 +1642,11 @@ define void @global_volatile_2xdouble(ptr addrspace(1) %a) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xdouble_param_0]; -; CHECK-NEXT: ld.volatile.global.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [global_volatile_2xdouble_param_0]; +; CHECK-NEXT: ld.volatile.global.v2.b64 {%fd1, %fd2}, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: 
st.volatile.global.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: st.volatile.global.v2.b64 [%rd1], {%fd4, %fd3}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x double>, ptr addrspace(1) %a %a.add = fadd <2 x double> %a.load, @@ -1665,11 +1665,11 @@ define void @shared_2xi8(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi8_param_0]; -; CHECK-NEXT: ld.shared.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_2xi8_param_0]; +; CHECK-NEXT: ld.shared.v2.b8 {%rs1, %rs2}, [%rd1]; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: st.shared.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: st.shared.v2.b8 [%rd1], {%rs4, %rs3}; ; CHECK-NEXT: ret; %a.load = load <2 x i8>, ptr addrspace(3) %a %a.add = add <2 x i8> %a.load, @@ -1685,8 +1685,8 @@ define void @shared_4xi8(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi8_param_0]; -; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_4xi8_param_0]; +; CHECK-NEXT: ld.shared.b32 %r1, [%rd1]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; @@ -1706,7 +1706,7 @@ define void @shared_4xi8(ptr addrspace(3) %a) { ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; ; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; -; CHECK-NEXT: st.shared.u32 [%rd1], %r12; +; CHECK-NEXT: st.shared.b32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load <4 x i8>, ptr addrspace(3) %a %a.add = add <4 x i8> %a.load, @@ -1722,7 +1722,7 @@ define void @shared_8xi8(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_8xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_8xi8_param_0]; ; CHECK-NEXT: ld.shared.v2.b32 {%r1, %r2}, 
[%rd1]; ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; @@ -1778,7 +1778,7 @@ define void @shared_16xi8(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_16xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_16xi8_param_0]; ; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; @@ -1872,13 +1872,13 @@ define void @shared_2xi16(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi16_param_0]; -; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_2xi16_param_0]; +; CHECK-NEXT: ld.shared.b32 %r1, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; -; CHECK-NEXT: st.shared.u32 [%rd1], %r2; +; CHECK-NEXT: st.shared.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load <2 x i16>, ptr addrspace(3) %a %a.add = add <2 x i16> %a.load, @@ -1893,13 +1893,13 @@ define void @shared_4xi16(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi16_param_0]; -; CHECK-NEXT: ld.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_4xi16_param_0]; +; CHECK-NEXT: ld.shared.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; -; CHECK-NEXT: st.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: st.shared.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; ; CHECK-NEXT: ret; %a.load = load <4 x i16>, ptr addrspace(3) %a %a.add = add <4 x i16> %a.load, @@ -1915,7 +1915,7 @@ define void @shared_8xi16(ptr addrspace(3) %a) { 
; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_8xi16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_8xi16_param_0]; ; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; @@ -1948,11 +1948,11 @@ define void @shared_2xi32(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi32_param_0]; -; CHECK-NEXT: ld.shared.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_2xi32_param_0]; +; CHECK-NEXT: ld.shared.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: add.s32 %r3, %r2, 1; ; CHECK-NEXT: add.s32 %r4, %r1, 1; -; CHECK-NEXT: st.shared.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: st.shared.v2.b32 [%rd1], {%r4, %r3}; ; CHECK-NEXT: ret; %a.load = load <2 x i32>, ptr addrspace(3) %a %a.add = add <2 x i32> %a.load, @@ -1967,13 +1967,13 @@ define void @shared_4xi32(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi32_param_0]; -; CHECK-NEXT: ld.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_4xi32_param_0]; +; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: add.s32 %r5, %r4, 1; ; CHECK-NEXT: add.s32 %r6, %r3, 1; ; CHECK-NEXT: add.s32 %r7, %r2, 1; ; CHECK-NEXT: add.s32 %r8, %r1, 1; -; CHECK-NEXT: st.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: st.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; ; CHECK-NEXT: ret; %a.load = load <4 x i32>, ptr addrspace(3) %a %a.add = add <4 x i32> %a.load, @@ -1987,11 +1987,11 @@ define void @shared_2xi64(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi64_param_0]; -; CHECK-NEXT: ld.shared.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, 
[shared_2xi64_param_0]; +; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1]; ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; -; CHECK-NEXT: st.shared.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: st.shared.v2.b64 [%rd1], {%rd5, %rd4}; ; CHECK-NEXT: ret; %a.load = load <2 x i64>, ptr addrspace(3) %a %a.add = add <2 x i64> %a.load, @@ -2006,11 +2006,11 @@ define void @shared_2xfloat(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xfloat_param_0]; -; CHECK-NEXT: ld.shared.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_2xfloat_param_0]; +; CHECK-NEXT: ld.shared.v2.b32 {%f1, %f2}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; -; CHECK-NEXT: st.shared.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: st.shared.v2.b32 [%rd1], {%f4, %f3}; ; CHECK-NEXT: ret; %a.load = load <2 x float>, ptr addrspace(3) %a %a.add = fadd <2 x float> %a.load, @@ -2025,13 +2025,13 @@ define void @shared_4xfloat(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xfloat_param_0]; -; CHECK-NEXT: ld.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_4xfloat_param_0]; +; CHECK-NEXT: ld.shared.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; -; CHECK-NEXT: st.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: st.shared.v4.b32 [%rd1], {%f8, %f7, %f6, %f5}; ; CHECK-NEXT: ret; %a.load = load <4 x float>, ptr addrspace(3) %a %a.add = fadd <4 x float> %a.load, @@ -2046,11 +2046,11 @@ define void @shared_2xdouble(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: 
ld.param.u64 %rd1, [shared_2xdouble_param_0]; -; CHECK-NEXT: ld.shared.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_2xdouble_param_0]; +; CHECK-NEXT: ld.shared.v2.b64 {%fd1, %fd2}, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.shared.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: st.shared.v2.b64 [%rd1], {%fd4, %fd3}; ; CHECK-NEXT: ret; %a.load = load <2 x double>, ptr addrspace(3) %a %a.add = fadd <2 x double> %a.load, @@ -2067,11 +2067,11 @@ define void @shared_volatile_2xi8(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi8_param_0]; -; CHECK-NEXT: ld.volatile.shared.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_2xi8_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.b8 {%rs1, %rs2}, [%rd1]; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: st.volatile.shared.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: st.volatile.shared.v2.b8 [%rd1], {%rs4, %rs3}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i8>, ptr addrspace(3) %a %a.add = add <2 x i8> %a.load, @@ -2087,8 +2087,8 @@ define void @shared_volatile_4xi8(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi8_param_0]; -; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_4xi8_param_0]; +; CHECK-NEXT: ld.volatile.shared.b32 %r1, [%rd1]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; @@ -2108,7 +2108,7 @@ define void @shared_volatile_4xi8(ptr addrspace(3) %a) { ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; ; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; -; CHECK-NEXT: 
st.volatile.shared.u32 [%rd1], %r12; +; CHECK-NEXT: st.volatile.shared.b32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i8>, ptr addrspace(3) %a %a.add = add <4 x i8> %a.load, @@ -2124,7 +2124,7 @@ define void @shared_volatile_8xi8(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_8xi8_param_0]; ; CHECK-NEXT: ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; @@ -2180,7 +2180,7 @@ define void @shared_volatile_16xi8(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_16xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_16xi8_param_0]; ; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; @@ -2274,13 +2274,13 @@ define void @shared_volatile_2xi16(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi16_param_0]; -; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_2xi16_param_0]; +; CHECK-NEXT: ld.volatile.shared.b32 %r1, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; -; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2; +; CHECK-NEXT: st.volatile.shared.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i16>, ptr addrspace(3) %a %a.add = add <2 x i16> %a.load, @@ -2295,13 +2295,13 @@ define void @shared_volatile_4xi16(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, 
[shared_volatile_4xi16_param_0]; -; CHECK-NEXT: ld.volatile.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_4xi16_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; -; CHECK-NEXT: st.volatile.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: st.volatile.shared.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i16>, ptr addrspace(3) %a %a.add = add <4 x i16> %a.load, @@ -2317,7 +2317,7 @@ define void @shared_volatile_8xi16(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_8xi16_param_0]; ; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; @@ -2350,11 +2350,11 @@ define void @shared_volatile_2xi32(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi32_param_0]; -; CHECK-NEXT: ld.volatile.shared.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_2xi32_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: add.s32 %r3, %r2, 1; ; CHECK-NEXT: add.s32 %r4, %r1, 1; -; CHECK-NEXT: st.volatile.shared.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: st.volatile.shared.v2.b32 [%rd1], {%r4, %r3}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i32>, ptr addrspace(3) %a %a.add = add <2 x i32> %a.load, @@ -2369,13 +2369,13 @@ define void @shared_volatile_4xi32(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, 
[shared_volatile_4xi32_param_0]; -; CHECK-NEXT: ld.volatile.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_4xi32_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: add.s32 %r5, %r4, 1; ; CHECK-NEXT: add.s32 %r6, %r3, 1; ; CHECK-NEXT: add.s32 %r7, %r2, 1; ; CHECK-NEXT: add.s32 %r8, %r1, 1; -; CHECK-NEXT: st.volatile.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i32>, ptr addrspace(3) %a %a.add = add <4 x i32> %a.load, @@ -2389,11 +2389,11 @@ define void @shared_volatile_2xi64(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi64_param_0]; -; CHECK-NEXT: ld.volatile.shared.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_2xi64_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1]; ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; -; CHECK-NEXT: st.volatile.shared.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd1], {%rd5, %rd4}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i64>, ptr addrspace(3) %a %a.add = add <2 x i64> %a.load, @@ -2408,11 +2408,11 @@ define void @shared_volatile_2xfloat(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xfloat_param_0]; -; CHECK-NEXT: ld.volatile.shared.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_2xfloat_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.b32 {%f1, %f2}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.shared.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: st.volatile.shared.v2.b32 [%rd1], {%f4, %f3}; ; CHECK-NEXT: ret; 
%a.load = load volatile <2 x float>, ptr addrspace(3) %a %a.add = fadd <2 x float> %a.load, @@ -2427,13 +2427,13 @@ define void @shared_volatile_4xfloat(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xfloat_param_0]; -; CHECK-NEXT: ld.volatile.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_4xfloat_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%f8, %f7, %f6, %f5}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x float>, ptr addrspace(3) %a %a.add = fadd <4 x float> %a.load, @@ -2448,11 +2448,11 @@ define void @shared_volatile_2xdouble(ptr addrspace(3) %a) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xdouble_param_0]; -; CHECK-NEXT: ld.volatile.shared.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_2xdouble_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.b64 {%fd1, %fd2}, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.volatile.shared.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd1], {%fd4, %fd3}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x double>, ptr addrspace(3) %a %a.add = fadd <2 x double> %a.load, @@ -2471,11 +2471,11 @@ define void @local_2xi8(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi8_param_0]; -; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1]; +; 
CHECK-NEXT: ld.param.b64 %rd1, [local_2xi8_param_0]; +; CHECK-NEXT: ld.local.v2.b8 {%rs1, %rs2}, [%rd1]; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: st.local.v2.b8 [%rd1], {%rs4, %rs3}; ; CHECK-NEXT: ret; %a.load = load <2 x i8>, ptr addrspace(5) %a %a.add = add <2 x i8> %a.load, @@ -2491,8 +2491,8 @@ define void @local_4xi8(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi8_param_0]; -; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_4xi8_param_0]; +; CHECK-NEXT: ld.local.b32 %r1, [%rd1]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; @@ -2512,7 +2512,7 @@ define void @local_4xi8(ptr addrspace(5) %a) { ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; ; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; -; CHECK-NEXT: st.local.u32 [%rd1], %r12; +; CHECK-NEXT: st.local.b32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load <4 x i8>, ptr addrspace(5) %a %a.add = add <4 x i8> %a.load, @@ -2528,7 +2528,7 @@ define void @local_8xi8(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_8xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_8xi8_param_0]; ; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; @@ -2584,7 +2584,7 @@ define void @local_16xi8(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_16xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_16xi8_param_0]; ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; @@ -2678,13 +2678,13 
@@ define void @local_2xi16(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi16_param_0]; -; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_2xi16_param_0]; +; CHECK-NEXT: ld.local.b32 %r1, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; -; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: st.local.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load <2 x i16>, ptr addrspace(5) %a %a.add = add <2 x i16> %a.load, @@ -2699,13 +2699,13 @@ define void @local_4xi16(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi16_param_0]; -; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_4xi16_param_0]; +; CHECK-NEXT: ld.local.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; -; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: st.local.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; ; CHECK-NEXT: ret; %a.load = load <4 x i16>, ptr addrspace(5) %a %a.add = add <4 x i16> %a.load, @@ -2721,7 +2721,7 @@ define void @local_8xi16(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_8xi16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_8xi16_param_0]; ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; @@ -2754,11 +2754,11 @@ define void @local_2xi32(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, 
[local_2xi32_param_0]; -; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_2xi32_param_0]; +; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: add.s32 %r3, %r2, 1; ; CHECK-NEXT: add.s32 %r4, %r1, 1; -; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: st.local.v2.b32 [%rd1], {%r4, %r3}; ; CHECK-NEXT: ret; %a.load = load <2 x i32>, ptr addrspace(5) %a %a.add = add <2 x i32> %a.load, @@ -2773,13 +2773,13 @@ define void @local_4xi32(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi32_param_0]; -; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_4xi32_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: add.s32 %r5, %r4, 1; ; CHECK-NEXT: add.s32 %r6, %r3, 1; ; CHECK-NEXT: add.s32 %r7, %r2, 1; ; CHECK-NEXT: add.s32 %r8, %r1, 1; -; CHECK-NEXT: st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; ; CHECK-NEXT: ret; %a.load = load <4 x i32>, ptr addrspace(5) %a %a.add = add <4 x i32> %a.load, @@ -2793,11 +2793,11 @@ define void @local_2xi64(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi64_param_0]; -; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_2xi64_param_0]; +; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; -; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: st.local.v2.b64 [%rd1], {%rd5, %rd4}; ; CHECK-NEXT: ret; %a.load = load <2 x i64>, ptr addrspace(5) %a %a.add = add <2 x i64> %a.load, @@ -2812,11 +2812,11 @@ define void @local_2xfloat(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: 
ld.param.u64 %rd1, [local_2xfloat_param_0]; -; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_2xfloat_param_0]; +; CHECK-NEXT: ld.local.v2.b32 {%f1, %f2}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; -; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: st.local.v2.b32 [%rd1], {%f4, %f3}; ; CHECK-NEXT: ret; %a.load = load <2 x float>, ptr addrspace(5) %a %a.add = fadd <2 x float> %a.load, @@ -2831,13 +2831,13 @@ define void @local_4xfloat(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_4xfloat_param_0]; -; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_4xfloat_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; -; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: st.local.v4.b32 [%rd1], {%f8, %f7, %f6, %f5}; ; CHECK-NEXT: ret; %a.load = load <4 x float>, ptr addrspace(5) %a %a.add = fadd <4 x float> %a.load, @@ -2852,11 +2852,11 @@ define void @local_2xdouble(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_2xdouble_param_0]; -; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_2xdouble_param_0]; +; CHECK-NEXT: ld.local.v2.b64 {%fd1, %fd2}, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: st.local.v2.b64 [%rd1], {%fd4, %fd3}; ; CHECK-NEXT: ret; %a.load = load <2 x double>, ptr addrspace(5) %a %a.add = fadd <2 x double> %a.load, @@ 
-2873,11 +2873,11 @@ define void @local_volatile_2xi8(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi8_param_0]; -; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_2xi8_param_0]; +; CHECK-NEXT: ld.local.v2.b8 {%rs1, %rs2}, [%rd1]; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3}; +; CHECK-NEXT: st.local.v2.b8 [%rd1], {%rs4, %rs3}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i8>, ptr addrspace(5) %a %a.add = add <2 x i8> %a.load, @@ -2893,8 +2893,8 @@ define void @local_volatile_4xi8(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi8_param_0]; -; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_4xi8_param_0]; +; CHECK-NEXT: ld.local.b32 %r1, [%rd1]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: add.s16 %rs2, %rs1, 1; @@ -2914,7 +2914,7 @@ define void @local_volatile_4xi8(ptr addrspace(5) %a) { ; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; ; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; ; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; -; CHECK-NEXT: st.local.u32 [%rd1], %r12; +; CHECK-NEXT: st.local.b32 [%rd1], %r12; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i8>, ptr addrspace(5) %a %a.add = add <4 x i8> %a.load, @@ -2930,7 +2930,7 @@ define void @local_volatile_8xi8(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_8xi8_param_0]; ; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; @@ -2986,7 +2986,7 @@ define void 
@local_volatile_16xi8(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_16xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_16xi8_param_0]; ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; @@ -3080,13 +3080,13 @@ define void @local_volatile_2xi16(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi16_param_0]; -; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_2xi16_param_0]; +; CHECK-NEXT: ld.local.b32 %r1, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; ; CHECK-NEXT: add.s16 %rs4, %rs1, 1; ; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; -; CHECK-NEXT: st.local.u32 [%rd1], %r2; +; CHECK-NEXT: st.local.b32 [%rd1], %r2; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i16>, ptr addrspace(5) %a %a.add = add <2 x i16> %a.load, @@ -3101,13 +3101,13 @@ define void @local_volatile_4xi16(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi16_param_0]; -; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_4xi16_param_0]; +; CHECK-NEXT: ld.local.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; ; CHECK-NEXT: add.s16 %rs5, %rs4, 1; ; CHECK-NEXT: add.s16 %rs6, %rs3, 1; ; CHECK-NEXT: add.s16 %rs7, %rs2, 1; ; CHECK-NEXT: add.s16 %rs8, %rs1, 1; -; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: st.local.v4.b16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i16>, ptr addrspace(5) %a %a.add = add <4 x i16> %a.load, @@ -3123,7 +3123,7 @@ define void @local_volatile_8xi16(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 
%rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xi16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_8xi16_param_0]; ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: add.s16 %rs3, %rs2, 1; @@ -3156,11 +3156,11 @@ define void @local_volatile_2xi32(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi32_param_0]; -; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_2xi32_param_0]; +; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1]; ; CHECK-NEXT: add.s32 %r3, %r2, 1; ; CHECK-NEXT: add.s32 %r4, %r1, 1; -; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: st.local.v2.b32 [%rd1], {%r4, %r3}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i32>, ptr addrspace(5) %a %a.add = add <2 x i32> %a.load, @@ -3175,13 +3175,13 @@ define void @local_volatile_4xi32(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi32_param_0]; -; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_4xi32_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: add.s32 %r5, %r4, 1; ; CHECK-NEXT: add.s32 %r6, %r3, 1; ; CHECK-NEXT: add.s32 %r7, %r2, 1; ; CHECK-NEXT: add.s32 %r8, %r1, 1; -; CHECK-NEXT: st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i32>, ptr addrspace(5) %a %a.add = add <4 x i32> %a.load, @@ -3195,11 +3195,11 @@ define void @local_volatile_2xi64(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi64_param_0]; -; CHECK-NEXT: 
ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_2xi64_param_0]; +; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; ; CHECK-NEXT: add.s64 %rd4, %rd3, 1; ; CHECK-NEXT: add.s64 %rd5, %rd2, 1; -; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4}; +; CHECK-NEXT: st.local.v2.b64 [%rd1], {%rd5, %rd4}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x i64>, ptr addrspace(5) %a %a.add = add <2 x i64> %a.load, @@ -3214,11 +3214,11 @@ define void @local_volatile_2xfloat(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xfloat_param_0]; -; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_2xfloat_param_0]; +; CHECK-NEXT: ld.local.v2.b32 {%f1, %f2}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; -; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3}; +; CHECK-NEXT: st.local.v2.b32 [%rd1], {%f4, %f3}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x float>, ptr addrspace(5) %a %a.add = fadd <2 x float> %a.load, @@ -3233,13 +3233,13 @@ define void @local_volatile_4xfloat(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xfloat_param_0]; -; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_4xfloat_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; ; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; ; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; -; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; +; CHECK-NEXT: st.local.v4.b32 [%rd1], {%f8, %f7, %f6, %f5}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x float>, ptr addrspace(5) %a %a.add = fadd <4 x float> %a.load, @@ -3254,11 +3254,11 @@ 
define void @local_volatile_2xdouble(ptr addrspace(5) %a) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xdouble_param_0]; -; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_2xdouble_param_0]; +; CHECK-NEXT: ld.local.v2.b64 {%fd1, %fd2}, [%rd1]; ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; ; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3}; +; CHECK-NEXT: st.local.v2.b64 [%rd1], {%fd4, %fd3}; ; CHECK-NEXT: ret; %a.load = load volatile <2 x double>, ptr addrspace(5) %a %a.add = fadd <2 x double> %a.load, diff --git a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll index 6a34135a3178..4d7a4b50e894 100644 --- a/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll +++ b/llvm/test/CodeGen/NVPTX/load-with-non-coherent-cache.ll @@ -7,9 +7,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "nvptx64-unknown-unknown" ; SM20-LABEL: .visible .entry foo1( -; SM20: ld.global.f32 +; SM20: ld.global.b32 ; SM35-LABEL: .visible .entry foo1( -; SM35: ld.global.nc.f32 +; SM35: ld.global.nc.b32 define ptx_kernel void @foo1(ptr noalias readonly %from, ptr %to) { %1 = load float, ptr %from store float %1, ptr %to @@ -17,9 +17,9 @@ define ptx_kernel void @foo1(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible .entry foo2( -; SM20: ld.global.f64 +; SM20: ld.global.b64 ; SM35-LABEL: .visible .entry foo2( -; SM35: ld.global.nc.f64 +; SM35: ld.global.nc.b64 define ptx_kernel void @foo2(ptr noalias readonly %from, ptr %to) { %1 = load double, ptr %from store double %1, ptr %to @@ -27,9 +27,9 @@ define ptx_kernel void @foo2(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible .entry foo3( -; SM20: ld.global.u16 +; SM20: ld.global.b16 ; SM35-LABEL: .visible 
.entry foo3( -; SM35: ld.global.nc.u16 +; SM35: ld.global.nc.b16 define ptx_kernel void @foo3(ptr noalias readonly %from, ptr %to) { %1 = load i16, ptr %from store i16 %1, ptr %to @@ -37,9 +37,9 @@ define ptx_kernel void @foo3(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible .entry foo4( -; SM20: ld.global.u32 +; SM20: ld.global.b32 ; SM35-LABEL: .visible .entry foo4( -; SM35: ld.global.nc.u32 +; SM35: ld.global.nc.b32 define ptx_kernel void @foo4(ptr noalias readonly %from, ptr %to) { %1 = load i32, ptr %from store i32 %1, ptr %to @@ -47,9 +47,9 @@ define ptx_kernel void @foo4(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible .entry foo5( -; SM20: ld.global.u64 +; SM20: ld.global.b64 ; SM35-LABEL: .visible .entry foo5( -; SM35: ld.global.nc.u64 +; SM35: ld.global.nc.b64 define ptx_kernel void @foo5(ptr noalias readonly %from, ptr %to) { %1 = load i64, ptr %from store i64 %1, ptr %to @@ -58,9 +58,9 @@ define ptx_kernel void @foo5(ptr noalias readonly %from, ptr %to) { ; i128 is non standard integer in nvptx64 ; SM20-LABEL: .visible .entry foo6( -; SM20: ld.global.v2.u64 +; SM20: ld.global.v2.b64 ; SM35-LABEL: .visible .entry foo6( -; SM35: ld.global.nc.v2.u64 +; SM35: ld.global.nc.v2.b64 define ptx_kernel void @foo6(ptr noalias readonly %from, ptr %to) { %1 = load i128, ptr %from store i128 %1, ptr %to @@ -68,9 +68,9 @@ define ptx_kernel void @foo6(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible .entry foo7( -; SM20: ld.global.v2.u8 +; SM20: ld.global.v2.b8 ; SM35-LABEL: .visible .entry foo7( -; SM35: ld.global.nc.v2.u8 +; SM35: ld.global.nc.v2.b8 define ptx_kernel void @foo7(ptr noalias readonly %from, ptr %to) { %1 = load <2 x i8>, ptr %from store <2 x i8> %1, ptr %to @@ -78,9 +78,9 @@ define ptx_kernel void @foo7(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible .entry foo8( -; SM20: ld.global.u32 +; SM20: ld.global.b32 ; SM35-LABEL: .visible .entry foo8( -; SM35: ld.global.nc.u32 +; SM35: 
ld.global.nc.b32 define ptx_kernel void @foo8(ptr noalias readonly %from, ptr %to) { %1 = load <2 x i16>, ptr %from store <2 x i16> %1, ptr %to @@ -88,9 +88,9 @@ define ptx_kernel void @foo8(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible .entry foo9( -; SM20: ld.global.v2.u32 +; SM20: ld.global.v2.b32 ; SM35-LABEL: .visible .entry foo9( -; SM35: ld.global.nc.v2.u32 +; SM35: ld.global.nc.v2.b32 define ptx_kernel void @foo9(ptr noalias readonly %from, ptr %to) { %1 = load <2 x i32>, ptr %from store <2 x i32> %1, ptr %to @@ -98,9 +98,9 @@ define ptx_kernel void @foo9(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible .entry foo10( -; SM20: ld.global.v2.u64 +; SM20: ld.global.v2.b64 ; SM35-LABEL: .visible .entry foo10( -; SM35: ld.global.nc.v2.u64 +; SM35: ld.global.nc.v2.b64 define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) { %1 = load <2 x i64>, ptr %from store <2 x i64> %1, ptr %to @@ -108,9 +108,9 @@ define ptx_kernel void @foo10(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible .entry foo11( -; SM20: ld.global.v2.f32 +; SM20: ld.global.v2.b32 ; SM35-LABEL: .visible .entry foo11( -; SM35: ld.global.nc.v2.f32 +; SM35: ld.global.nc.v2.b32 define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) { %1 = load <2 x float>, ptr %from store <2 x float> %1, ptr %to @@ -118,9 +118,9 @@ define ptx_kernel void @foo11(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible .entry foo12( -; SM20: ld.global.v2.f64 +; SM20: ld.global.v2.b64 ; SM35-LABEL: .visible .entry foo12( -; SM35: ld.global.nc.v2.f64 +; SM35: ld.global.nc.v2.b64 define ptx_kernel void @foo12(ptr noalias readonly %from, ptr %to) { %1 = load <2 x double>, ptr %from store <2 x double> %1, ptr %to @@ -128,9 +128,9 @@ define ptx_kernel void @foo12(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible .entry foo13( -; SM20: ld.global.u32 +; SM20: ld.global.b32 ; SM35-LABEL: .visible .entry foo13( -; SM35: ld.global.nc.u32 +; 
SM35: ld.global.nc.b32 define ptx_kernel void @foo13(ptr noalias readonly %from, ptr %to) { %1 = load <4 x i8>, ptr %from store <4 x i8> %1, ptr %to @@ -138,9 +138,9 @@ define ptx_kernel void @foo13(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible .entry foo14( -; SM20: ld.global.v4.u16 +; SM20: ld.global.v4.b16 ; SM35-LABEL: .visible .entry foo14( -; SM35: ld.global.nc.v4.u16 +; SM35: ld.global.nc.v4.b16 define ptx_kernel void @foo14(ptr noalias readonly %from, ptr %to) { %1 = load <4 x i16>, ptr %from store <4 x i16> %1, ptr %to @@ -148,9 +148,9 @@ define ptx_kernel void @foo14(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible .entry foo15( -; SM20: ld.global.v4.u32 +; SM20: ld.global.v4.b32 ; SM35-LABEL: .visible .entry foo15( -; SM35: ld.global.nc.v4.u32 +; SM35: ld.global.nc.v4.b32 define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) { %1 = load <4 x i32>, ptr %from store <4 x i32> %1, ptr %to @@ -158,9 +158,9 @@ define ptx_kernel void @foo15(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible .entry foo16( -; SM20: ld.global.v4.f32 +; SM20: ld.global.v4.b32 ; SM35-LABEL: .visible .entry foo16( -; SM35: ld.global.nc.v4.f32 +; SM35: ld.global.nc.v4.b32 define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) { %1 = load <4 x float>, ptr %from store <4 x float> %1, ptr %to @@ -168,11 +168,11 @@ define ptx_kernel void @foo16(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible .entry foo17( -; SM20: ld.global.v2.f64 -; SM20: ld.global.v2.f64 +; SM20: ld.global.v2.b64 +; SM20: ld.global.v2.b64 ; SM35-LABEL: .visible .entry foo17( -; SM35: ld.global.nc.v2.f64 -; SM35: ld.global.nc.v2.f64 +; SM35: ld.global.nc.v2.b64 +; SM35: ld.global.nc.v2.b64 define ptx_kernel void @foo17(ptr noalias readonly %from, ptr %to) { %1 = load <4 x double>, ptr %from store <4 x double> %1, ptr %to @@ -180,9 +180,9 @@ define ptx_kernel void @foo17(ptr noalias readonly %from, ptr %to) { } ; SM20-LABEL: .visible 
.entry foo18( -; SM20: ld.global.u64 +; SM20: ld.global.b64 ; SM35-LABEL: .visible .entry foo18( -; SM35: ld.global.nc.u64 +; SM35: ld.global.nc.b64 define ptx_kernel void @foo18(ptr noalias readonly %from, ptr %to) { %1 = load ptr, ptr %from store ptr %1, ptr %to @@ -191,9 +191,9 @@ define ptx_kernel void @foo18(ptr noalias readonly %from, ptr %to) { ; Test that we can infer a cached load for a pointer induction variable. ; SM20-LABEL: .visible .entry foo19( -; SM20: ld.global.f32 +; SM20: ld.global.b32 ; SM35-LABEL: .visible .entry foo19( -; SM35: ld.global.nc.f32 +; SM35: ld.global.nc.b32 define ptx_kernel void @foo19(ptr noalias readonly %from, ptr %to, i32 %n) { entry: br label %loop @@ -219,9 +219,9 @@ exit: ; pointed-to memory is never written to (for the duration of the ; kernel). For both reasons, we cannot use a cached load here. ; SM20-LABEL: notkernel( -; SM20: ld.f32 +; SM20: ld.b32 ; SM35-LABEL: notkernel( -; SM35: ld.f32 +; SM35: ld.b32 define void @notkernel(ptr noalias readonly %from, ptr %to) { %1 = load float, ptr %from store float %1, ptr %to @@ -233,9 +233,9 @@ define void @notkernel(ptr noalias readonly %from, ptr %to) { ; kernel). This case does not currently come up normally since we do not infer ; that pointers are global interprocedurally as of 2015-08-05. 
; SM20-LABEL: notkernel2( -; SM20: ld.global.f32 +; SM20: ld.global.b32 ; SM35-LABEL: notkernel2( -; SM35: ld.global.f32 +; SM35: ld.global.b32 define void @notkernel2(ptr addrspace(1) noalias readonly %from, ptr %to) { %1 = load float, ptr addrspace(1) %from store float %1, ptr %to diff --git a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll index f49053485fa2..2bfd891a04a1 100644 --- a/llvm/test/CodeGen/NVPTX/local-stack-frame.ll +++ b/llvm/test/CodeGen/NVPTX/local-stack-frame.ll @@ -16,9 +16,9 @@ define void @foo(i32 %a) { ; PTX32-EMPTY: ; PTX32-NEXT: // %bb.0: ; PTX32-NEXT: mov.b32 %SPL, __local_depot0; -; PTX32-NEXT: ld.param.u32 %r1, [foo_param_0]; +; PTX32-NEXT: ld.param.b32 %r1, [foo_param_0]; ; PTX32-NEXT: add.u32 %r3, %SPL, 0; -; PTX32-NEXT: st.local.u32 [%r3], %r1; +; PTX32-NEXT: st.local.b32 [%r3], %r1; ; PTX32-NEXT: ret; ; ; PTX64-LABEL: foo( @@ -31,9 +31,9 @@ define void @foo(i32 %a) { ; PTX64-EMPTY: ; PTX64-NEXT: // %bb.0: ; PTX64-NEXT: mov.b64 %SPL, __local_depot0; -; PTX64-NEXT: ld.param.u32 %r1, [foo_param_0]; +; PTX64-NEXT: ld.param.b32 %r1, [foo_param_0]; ; PTX64-NEXT: add.u64 %rd2, %SPL, 0; -; PTX64-NEXT: st.local.u32 [%rd2], %r1; +; PTX64-NEXT: st.local.b32 [%rd2], %r1; ; PTX64-NEXT: ret; %local = alloca i32, align 4 store volatile i32 %a, ptr %local @@ -51,10 +51,10 @@ define ptx_kernel void @foo2(i32 %a) { ; PTX32-NEXT: // %bb.0: ; PTX32-NEXT: mov.b32 %SPL, __local_depot1; ; PTX32-NEXT: cvta.local.u32 %SP, %SPL; -; PTX32-NEXT: ld.param.u32 %r1, [foo2_param_0]; +; PTX32-NEXT: ld.param.b32 %r1, [foo2_param_0]; ; PTX32-NEXT: add.u32 %r2, %SP, 0; ; PTX32-NEXT: add.u32 %r3, %SPL, 0; -; PTX32-NEXT: st.local.u32 [%r3], %r1; +; PTX32-NEXT: st.local.b32 [%r3], %r1; ; PTX32-NEXT: { // callseq 0, 0 ; PTX32-NEXT: .param .b32 param0; ; PTX32-NEXT: st.param.b32 [param0], %r2; @@ -77,10 +77,10 @@ define ptx_kernel void @foo2(i32 %a) { ; PTX64-NEXT: // %bb.0: ; PTX64-NEXT: mov.b64 %SPL, __local_depot1; ; 
PTX64-NEXT: cvta.local.u64 %SP, %SPL; -; PTX64-NEXT: ld.param.u32 %r1, [foo2_param_0]; +; PTX64-NEXT: ld.param.b32 %r1, [foo2_param_0]; ; PTX64-NEXT: add.u64 %rd1, %SP, 0; ; PTX64-NEXT: add.u64 %rd2, %SPL, 0; -; PTX64-NEXT: st.local.u32 [%rd2], %r1; +; PTX64-NEXT: st.local.b32 [%rd2], %r1; ; PTX64-NEXT: { // callseq 0, 0 ; PTX64-NEXT: .param .b64 param0; ; PTX64-NEXT: st.param.b64 [param0], %rd1; @@ -109,11 +109,11 @@ define void @foo3(i32 %a) { ; PTX32-EMPTY: ; PTX32-NEXT: // %bb.0: ; PTX32-NEXT: mov.b32 %SPL, __local_depot2; -; PTX32-NEXT: ld.param.u32 %r1, [foo3_param_0]; +; PTX32-NEXT: ld.param.b32 %r1, [foo3_param_0]; ; PTX32-NEXT: add.u32 %r3, %SPL, 0; ; PTX32-NEXT: shl.b32 %r4, %r1, 2; ; PTX32-NEXT: add.s32 %r5, %r3, %r4; -; PTX32-NEXT: st.local.u32 [%r5], %r1; +; PTX32-NEXT: st.local.b32 [%r5], %r1; ; PTX32-NEXT: ret; ; ; PTX64-LABEL: foo3( @@ -126,11 +126,11 @@ define void @foo3(i32 %a) { ; PTX64-EMPTY: ; PTX64-NEXT: // %bb.0: ; PTX64-NEXT: mov.b64 %SPL, __local_depot2; -; PTX64-NEXT: ld.param.u32 %r1, [foo3_param_0]; +; PTX64-NEXT: ld.param.b32 %r1, [foo3_param_0]; ; PTX64-NEXT: add.u64 %rd2, %SPL, 0; ; PTX64-NEXT: mul.wide.s32 %rd3, %r1, 4; ; PTX64-NEXT: add.s64 %rd4, %rd2, %rd3; -; PTX64-NEXT: st.local.u32 [%rd4], %r1; +; PTX64-NEXT: st.local.b32 [%rd4], %r1; ; PTX64-NEXT: ret; %local = alloca [3 x i32], align 4 %1 = getelementptr inbounds i32, ptr %local, i32 %a @@ -154,8 +154,8 @@ define void @foo4() { ; PTX32-NEXT: add.u32 %r3, %SP, 4; ; PTX32-NEXT: add.u32 %r4, %SPL, 4; ; PTX32-NEXT: mov.b32 %r5, 0; -; PTX32-NEXT: st.local.u32 [%r2], %r5; -; PTX32-NEXT: st.local.u32 [%r4], %r5; +; PTX32-NEXT: st.local.b32 [%r2], %r5; +; PTX32-NEXT: st.local.b32 [%r4], %r5; ; PTX32-NEXT: { // callseq 1, 0 ; PTX32-NEXT: .param .b32 param0; ; PTX32-NEXT: st.param.b32 [param0], %r1; @@ -192,8 +192,8 @@ define void @foo4() { ; PTX64-NEXT: add.u64 %rd3, %SP, 4; ; PTX64-NEXT: add.u64 %rd4, %SPL, 4; ; PTX64-NEXT: mov.b32 %r1, 0; -; PTX64-NEXT: st.local.u32 [%rd2], %r1; -; 
PTX64-NEXT: st.local.u32 [%rd4], %r1; +; PTX64-NEXT: st.local.b32 [%rd2], %r1; +; PTX64-NEXT: st.local.b32 [%rd4], %r1; ; PTX64-NEXT: { // callseq 1, 0 ; PTX64-NEXT: .param .b64 param0; ; PTX64-NEXT: st.param.b64 [param0], %rd1; diff --git a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll index 720c95b51358..99212fc0dff7 100644 --- a/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll +++ b/llvm/test/CodeGen/NVPTX/lower-aggr-copies.ll @@ -37,8 +37,8 @@ entry: ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_caller ; PTX: $L__BB[[LABEL:[_0-9]+]]: -; PTX: ld.u8 %rs[[REG:[0-9]+]] -; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]] +; PTX: ld.b8 %rs[[REG:[0-9]+]] +; PTX: st.b8 [%rd{{[0-9]+}}], %rs[[REG]] ; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 ; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd ; PTX: @%p[[PRED]] bra $L__BB[[LABEL]] @@ -71,8 +71,8 @@ entry: ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_volatile_caller ; PTX: $L__BB[[LABEL:[_0-9]+]]: -; PTX: ld.volatile.u8 %rs[[REG:[0-9]+]] -; PTX: st.volatile.u8 [%rd{{[0-9]+}}], %rs[[REG]] +; PTX: ld.volatile.b8 %rs[[REG:[0-9]+]] +; PTX: st.volatile.b8 [%rd{{[0-9]+}}], %rs[[REG]] ; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 ; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd ; PTX: @%p[[PRED]] bra $L__BB[[LABEL]] @@ -124,10 +124,10 @@ entry: ; IR-NEXT: store i8 [[VAL]], ptr [[STOREPTR]] ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memset_caller( -; PTX: ld.param.u32 %r[[C:[0-9]+]] +; PTX: ld.param.b32 %r[[C:[0-9]+]] ; PTX: cvt.u16.u32 %rs[[REG:[0-9]+]], %r[[C]]; ; PTX: $L__BB[[LABEL:[_0-9]+]]: -; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]] +; PTX: st.b8 [%rd{{[0-9]+}}], %rs[[REG]] ; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1 ; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd ; PTX: @%p[[PRED]] bra $L__BB[[LABEL]] @@ -159,20 +159,20 @@ entry: ; IR: {{%[0-9a-zA-Z_]+}} = add i64 [[FWDPHIVAL]], 1 ; PTX-LABEL: 
.visible .func (.param .b64 func_retval0) memmove_caller( -; PTX: ld.param.u64 %rd[[N:[0-9]+]] +; PTX: ld.param.b64 %rd[[N:[0-9]+]] ; PTX-DAG: setp.eq.s64 %p[[NEQ0:[0-9]+]], %rd[[N]], 0 ; PTX-DAG: setp.ge.u64 %p[[SRC_GT_THAN_DST:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}} ; PTX-NEXT: @%p[[SRC_GT_THAN_DST]] bra $L__BB[[FORWARD_BB:[0-9_]+]] ; -- this is the backwards copying BB ; PTX: @%p[[NEQ0]] bra $L__BB[[EXIT:[0-9_]+]] ; PTX: add.s64 %rd{{[0-9]}}, %rd{{[0-9]}}, -1 -; PTX: ld.u8 %rs[[ELEMENT:[0-9]+]] -; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT]] +; PTX: ld.b8 %rs[[ELEMENT:[0-9]+]] +; PTX: st.b8 [%rd{{[0-9]+}}], %rs[[ELEMENT]] ; -- this is the forwards copying BB ; PTX: $L__BB[[FORWARD_BB]]: ; PTX: @%p[[NEQ0]] bra $L__BB[[EXIT]] -; PTX: ld.u8 %rs[[ELEMENT2:[0-9]+]] -; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT2]] +; PTX: ld.b8 %rs[[ELEMENT2:[0-9]+]] +; PTX: st.b8 [%rd{{[0-9]+}}], %rs[[ELEMENT2]] ; PTX: add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, 1 ; -- exit block ; PTX: $L__BB[[EXIT]]: diff --git a/llvm/test/CodeGen/NVPTX/lower-alloca.ll b/llvm/test/CodeGen/NVPTX/lower-alloca.ll index 530b48b3d3e3..489bcf4a7d55 100644 --- a/llvm/test/CodeGen/NVPTX/lower-alloca.ll +++ b/llvm/test/CodeGen/NVPTX/lower-alloca.ll @@ -15,7 +15,7 @@ define ptx_kernel void @kernel() { ; LOWERALLOCAONLY: [[V1:%.*]] = addrspacecast ptr %A to ptr addrspace(5) ; LOWERALLOCAONLY: [[V2:%.*]] = addrspacecast ptr addrspace(5) [[V1]] to ptr ; LOWERALLOCAONLY: store i32 0, ptr [[V2]], align 4 -; PTX: st.local.u32 [{{%rd[0-9]+}}], {{%r[0-9]+}} +; PTX: st.local.b32 [{{%rd[0-9]+}}], {{%r[0-9]+}} store i32 0, ptr %A call void @callee(ptr %A) ret void @@ -26,7 +26,7 @@ define void @alloca_in_explicit_local_as() { ; PTX-LABEL: .visible .func alloca_in_explicit_local_as( %A = alloca i32, addrspace(5) ; CHECK: store i32 0, ptr addrspace(5) {{%.+}} -; PTX: st.local.u32 [%SP], {{%r[0-9]+}} +; PTX: st.local.b32 [%SP], {{%r[0-9]+}} ; LOWERALLOCAONLY: [[V1:%.*]] = addrspacecast ptr addrspace(5) %A to ptr ; LOWERALLOCAONLY: 
store i32 0, ptr [[V1]], align 4 store i32 0, ptr addrspace(5) %A diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll index dd172cf68538..c3f94455b303 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll @@ -28,7 +28,7 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly ; PTX-NEXT: // %bb.0: // %entry ; PTX-NEXT: mov.b64 %rd1, non_kernel_function_param_0; ; PTX-NEXT: cvta.local.u64 %rd2, %rd1; -; PTX-NEXT: ld.param.u8 %rs1, [non_kernel_function_param_1]; +; PTX-NEXT: ld.param.b8 %rs1, [non_kernel_function_param_1]; ; PTX-NEXT: and.b16 %rs2, %rs1, 1; ; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0; ; PTX-NEXT: mov.b64 %rd3, gi; @@ -36,13 +36,13 @@ define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly ; PTX-NEXT: selp.b64 %rd5, %rd2, %rd4, %p1; ; PTX-NEXT: ld.param.s32 %rd6, [non_kernel_function_param_2]; ; PTX-NEXT: add.s64 %rd7, %rd5, %rd6; -; PTX-NEXT: ld.u8 %r1, [%rd7]; -; PTX-NEXT: ld.u8 %r2, [%rd7+1]; +; PTX-NEXT: ld.b8 %r1, [%rd7]; +; PTX-NEXT: ld.b8 %r2, [%rd7+1]; ; PTX-NEXT: shl.b32 %r3, %r2, 8; ; PTX-NEXT: or.b32 %r4, %r3, %r1; -; PTX-NEXT: ld.u8 %r5, [%rd7+2]; +; PTX-NEXT: ld.b8 %r5, [%rd7+2]; ; PTX-NEXT: shl.b32 %r6, %r5, 16; -; PTX-NEXT: ld.u8 %r7, [%rd7+3]; +; PTX-NEXT: ld.b8 %r7, [%rd7+3]; ; PTX-NEXT: shl.b32 %r8, %r7, 24; ; PTX-NEXT: or.b32 %r9, %r8, %r6; ; PTX-NEXT: or.b32 %r10, %r9, %r4; @@ -63,12 +63,12 @@ define ptx_kernel void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %inpu ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [grid_const_int_param_2]; +; PTX-NEXT: ld.param.b64 %rd1, [grid_const_int_param_2]; ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; -; PTX-NEXT: ld.param.u32 %r1, [grid_const_int_param_1]; -; PTX-NEXT: ld.param.u32 %r2, [grid_const_int_param_0]; +; PTX-NEXT: ld.param.b32 %r1, 
[grid_const_int_param_1]; +; PTX-NEXT: ld.param.b32 %r2, [grid_const_int_param_0]; ; PTX-NEXT: add.s32 %r3, %r2, %r1; -; PTX-NEXT: st.global.u32 [%rd2], %r3; +; PTX-NEXT: st.global.b32 [%rd2], %r3; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_int( ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { @@ -92,12 +92,12 @@ define ptx_kernel void @grid_const_struct(ptr byval(%struct.s) align 4 %input, p ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [grid_const_struct_param_1]; +; PTX-NEXT: ld.param.b64 %rd1, [grid_const_struct_param_1]; ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; -; PTX-NEXT: ld.param.u32 %r1, [grid_const_struct_param_0]; -; PTX-NEXT: ld.param.u32 %r2, [grid_const_struct_param_0+4]; +; PTX-NEXT: ld.param.b32 %r1, [grid_const_struct_param_0]; +; PTX-NEXT: ld.param.b32 %r2, [grid_const_struct_param_0+4]; ; PTX-NEXT: add.s32 %r3, %r1, %r2; -; PTX-NEXT: st.global.u32 [%rd2], %r3; +; PTX-NEXT: st.global.b32 [%rd2], %r3; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_struct( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] { @@ -165,13 +165,13 @@ define ptx_kernel void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 ; PTX-NEXT: mov.b64 %SPL, __local_depot4; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_0; -; PTX-NEXT: ld.param.u32 %r1, [multiple_grid_const_escape_param_1]; +; PTX-NEXT: ld.param.b32 %r1, [multiple_grid_const_escape_param_1]; ; PTX-NEXT: mov.b64 %rd3, multiple_grid_const_escape_param_2; ; PTX-NEXT: cvta.param.u64 %rd4, %rd3; ; PTX-NEXT: cvta.param.u64 %rd5, %rd2; ; PTX-NEXT: add.u64 %rd6, %SP, 0; ; PTX-NEXT: add.u64 %rd7, %SPL, 0; -; PTX-NEXT: st.local.u32 [%rd7], %r1; +; PTX-NEXT: st.local.b32 [%rd7], %r1; ; PTX-NEXT: mov.b64 %rd1, escape3; ; PTX-NEXT: { // callseq 1, 0 ; PTX-NEXT: .param .b64 
param0; @@ -216,10 +216,10 @@ define ptx_kernel void @grid_const_memory_escape(ptr byval(%struct.s) align 4 %i ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd1, grid_const_memory_escape_param_0; -; PTX-NEXT: ld.param.u64 %rd2, [grid_const_memory_escape_param_1]; +; PTX-NEXT: ld.param.b64 %rd2, [grid_const_memory_escape_param_1]; ; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; ; PTX-NEXT: cvta.param.u64 %rd4, %rd1; -; PTX-NEXT: st.global.u64 [%rd3], %rd4; +; PTX-NEXT: st.global.b64 [%rd3], %rd4; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_memory_escape( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT:%.*]], ptr [[ADDR:%.*]]) #[[ATTR0]] { @@ -238,14 +238,14 @@ define ptx_kernel void @grid_const_inlineasm_escape(ptr byval(%struct.s) align 4 ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd4, grid_const_inlineasm_escape_param_0; -; PTX-NEXT: ld.param.u64 %rd5, [grid_const_inlineasm_escape_param_1]; +; PTX-NEXT: ld.param.b64 %rd5, [grid_const_inlineasm_escape_param_1]; ; PTX-NEXT: cvta.to.global.u64 %rd6, %rd5; ; PTX-NEXT: cvta.param.u64 %rd2, %rd4; ; PTX-NEXT: add.s64 %rd3, %rd2, 4; ; PTX-NEXT: // begin inline asm ; PTX-NEXT: add.s64 %rd1, %rd2, %rd3; ; PTX-NEXT: // end inline asm -; PTX-NEXT: st.global.u64 [%rd6], %rd1; +; PTX-NEXT: st.global.b64 [%rd6], %rd1; ; PTX-NEXT: ret; ; PTX-NOT .local ; OPT-LABEL: define ptx_kernel void @grid_const_inlineasm_escape( @@ -272,12 +272,12 @@ define ptx_kernel void @grid_const_partial_escape(ptr byval(i32) %input, ptr %ou ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd2, grid_const_partial_escape_param_0; -; PTX-NEXT: ld.param.u64 %rd3, [grid_const_partial_escape_param_1]; +; PTX-NEXT: ld.param.b64 %rd3, [grid_const_partial_escape_param_1]; ; PTX-NEXT: cvta.to.global.u64 %rd4, %rd3; ; PTX-NEXT: cvta.param.u64 %rd5, %rd2; -; PTX-NEXT: ld.param.u32 %r1, [grid_const_partial_escape_param_0]; +; PTX-NEXT: ld.param.b32 %r1, [grid_const_partial_escape_param_0]; ; PTX-NEXT: add.s32 
%r2, %r1, %r1; -; PTX-NEXT: st.global.u32 [%rd4], %r2; +; PTX-NEXT: st.global.b32 [%rd4], %r2; ; PTX-NEXT: mov.b64 %rd1, escape; ; PTX-NEXT: { // callseq 2, 0 ; PTX-NEXT: .param .b64 param0; @@ -317,12 +317,12 @@ define ptx_kernel i32 @grid_const_partial_escapemem(ptr byval(%struct.s) %input, ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd2, grid_const_partial_escapemem_param_0; -; PTX-NEXT: ld.param.u64 %rd3, [grid_const_partial_escapemem_param_1]; +; PTX-NEXT: ld.param.b64 %rd3, [grid_const_partial_escapemem_param_1]; ; PTX-NEXT: cvta.to.global.u64 %rd4, %rd3; ; PTX-NEXT: cvta.param.u64 %rd5, %rd2; -; PTX-NEXT: ld.param.u32 %r1, [grid_const_partial_escapemem_param_0]; -; PTX-NEXT: ld.param.u32 %r2, [grid_const_partial_escapemem_param_0+4]; -; PTX-NEXT: st.global.u64 [%rd4], %rd5; +; PTX-NEXT: ld.param.b32 %r1, [grid_const_partial_escapemem_param_0]; +; PTX-NEXT: ld.param.b32 %r2, [grid_const_partial_escapemem_param_0+4]; +; PTX-NEXT: st.global.b64 [%rd4], %rd5; ; PTX-NEXT: add.s32 %r3, %r1, %r2; ; PTX-NEXT: mov.b64 %rd1, escape; ; PTX-NEXT: { // callseq 3, 0 @@ -371,16 +371,16 @@ define ptx_kernel void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd6, grid_const_phi_param_0; -; PTX-NEXT: ld.param.u64 %rd5, [grid_const_phi_param_1]; +; PTX-NEXT: ld.param.b64 %rd5, [grid_const_phi_param_1]; ; PTX-NEXT: cvta.to.global.u64 %rd1, %rd5; -; PTX-NEXT: ld.global.u32 %r1, [%rd1]; +; PTX-NEXT: ld.global.b32 %r1, [%rd1]; ; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; ; PTX-NEXT: @%p1 bra $L__BB9_2; ; PTX-NEXT: // %bb.1: // %second ; PTX-NEXT: add.s64 %rd6, %rd6, 4; ; PTX-NEXT: $L__BB9_2: // %merge -; PTX-NEXT: ld.param.u32 %r2, [%rd6]; -; PTX-NEXT: st.global.u32 [%rd1], %r2; +; PTX-NEXT: ld.param.b32 %r2, [%rd6]; +; PTX-NEXT: st.global.b32 [%rd1], %r2; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_phi( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr [[INOUT:%.*]]) 
#[[ATTR0]] { @@ -427,17 +427,17 @@ define ptx_kernel void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd7, grid_const_phi_ngc_param_0; -; PTX-NEXT: ld.param.u64 %rd6, [grid_const_phi_ngc_param_2]; +; PTX-NEXT: ld.param.b64 %rd6, [grid_const_phi_ngc_param_2]; ; PTX-NEXT: cvta.to.global.u64 %rd1, %rd6; -; PTX-NEXT: ld.global.u32 %r1, [%rd1]; +; PTX-NEXT: ld.global.b32 %r1, [%rd1]; ; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; ; PTX-NEXT: @%p1 bra $L__BB10_2; ; PTX-NEXT: // %bb.1: // %second ; PTX-NEXT: mov.b64 %rd2, grid_const_phi_ngc_param_1; ; PTX-NEXT: add.s64 %rd7, %rd2, 4; ; PTX-NEXT: $L__BB10_2: // %merge -; PTX-NEXT: ld.param.u32 %r2, [%rd7]; -; PTX-NEXT: st.global.u32 [%rd1], %r2; +; PTX-NEXT: ld.param.b32 %r2, [%rd7]; +; PTX-NEXT: st.global.b32 [%rd1], %r2; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_phi_ngc( ; OPT-SAME: ptr byval([[STRUCT_S:%.*]]) align 4 [[INPUT1:%.*]], ptr byval([[STRUCT_S]]) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { @@ -485,14 +485,14 @@ define ptx_kernel void @grid_const_select(ptr byval(i32) align 4 %input1, ptr by ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd1, grid_const_select_param_0; -; PTX-NEXT: ld.param.u64 %rd2, [grid_const_select_param_2]; +; PTX-NEXT: ld.param.b64 %rd2, [grid_const_select_param_2]; ; PTX-NEXT: cvta.to.global.u64 %rd3, %rd2; ; PTX-NEXT: mov.b64 %rd4, grid_const_select_param_1; -; PTX-NEXT: ld.global.u32 %r1, [%rd3]; +; PTX-NEXT: ld.global.b32 %r1, [%rd3]; ; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; ; PTX-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1; -; PTX-NEXT: ld.param.u32 %r2, [%rd5]; -; PTX-NEXT: st.global.u32 [%rd3], %r2; +; PTX-NEXT: ld.param.b32 %r2, [%rd5]; +; PTX-NEXT: st.global.b32 [%rd3], %r2; ; PTX-NEXT: ret; ; OPT-LABEL: define ptx_kernel void @grid_const_select( ; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], ptr byval(i32) [[INPUT2:%.*]], ptr [[INOUT:%.*]]) #[[ATTR0]] { @@ -522,7 +522,7 @@ define ptx_kernel 
i32 @grid_const_ptrtoint(ptr byval(i32) %input) { ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %rd1, grid_const_ptrtoint_param_0; -; PTX-NEXT: ld.param.u32 %r1, [grid_const_ptrtoint_param_0]; +; PTX-NEXT: ld.param.b32 %r1, [grid_const_ptrtoint_param_0]; ; PTX-NEXT: cvta.param.u64 %rd2, %rd1; ; PTX-NEXT: cvt.u32.u64 %r2, %rd2; ; PTX-NEXT: add.s32 %r3, %r1, %r2; @@ -557,7 +557,7 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) { ; PTX-NEXT: .reg .b32 %r<2>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u32 %r1, [test_forward_byval_arg_param_0]; +; PTX-NEXT: ld.param.b32 %r1, [test_forward_byval_arg_param_0]; ; PTX-NEXT: { // callseq 4, 0 ; PTX-NEXT: .param .align 4 .b8 param0[4]; ; PTX-NEXT: st.param.b32 [param0], %r1; diff --git a/llvm/test/CodeGen/NVPTX/lower-args.ll b/llvm/test/CodeGen/NVPTX/lower-args.ll index 8e879871e295..246408ecf6a3 100644 --- a/llvm/test/CodeGen/NVPTX/lower-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-args.ll @@ -35,14 +35,14 @@ define void @load_alignment(ptr nocapture readonly byval(%class.outer) align 8 % ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry ; PTX-NEXT: mov.b64 %rd1, load_alignment_param_0; -; PTX-NEXT: ld.local.u64 %rd2, [%rd1]; -; PTX-NEXT: ld.local.u64 %rd3, [%rd1+8]; +; PTX-NEXT: ld.local.b64 %rd2, [%rd1]; +; PTX-NEXT: ld.local.b64 %rd3, [%rd1+8]; ; PTX-NEXT: add.s64 %rd4, %rd1, 16; ; PTX-NEXT: cvta.local.u64 %rd5, %rd4; -; PTX-NEXT: ld.local.u32 %r1, [%rd1+16]; -; PTX-NEXT: ld.u32 %r2, [%rd2]; +; PTX-NEXT: ld.local.b32 %r1, [%rd1+16]; +; PTX-NEXT: ld.b32 %r2, [%rd2]; ; PTX-NEXT: add.s32 %r3, %r2, %r1; -; PTX-NEXT: st.u32 [%rd3], %r3; +; PTX-NEXT: st.b32 [%rd3], %r3; ; PTX-NEXT: { // callseq 0, 0 ; PTX-NEXT: .param .b64 param0; ; PTX-NEXT: st.param.b64 [param0], %rd5; @@ -116,12 +116,12 @@ define ptx_kernel void @ptr_generic(ptr %out, ptr %in) { ; PTXC-NEXT: .reg .b64 %rd<5>; ; PTXC-EMPTY: ; PTXC-NEXT: // %bb.0: -; PTXC-NEXT: ld.param.u64 %rd1, [ptr_generic_param_0]; -; 
PTXC-NEXT: ld.param.u64 %rd2, [ptr_generic_param_1]; +; PTXC-NEXT: ld.param.b64 %rd1, [ptr_generic_param_0]; +; PTXC-NEXT: ld.param.b64 %rd2, [ptr_generic_param_1]; ; PTXC-NEXT: cvta.to.global.u64 %rd3, %rd2; ; PTXC-NEXT: cvta.to.global.u64 %rd4, %rd1; -; PTXC-NEXT: ld.global.u32 %r1, [%rd3]; -; PTXC-NEXT: st.global.u32 [%rd4], %r1; +; PTXC-NEXT: ld.global.b32 %r1, [%rd3]; +; PTXC-NEXT: st.global.b32 [%rd4], %r1; ; PTXC-NEXT: ret; ; ; PTXO-LABEL: ptr_generic( @@ -130,10 +130,10 @@ define ptx_kernel void @ptr_generic(ptr %out, ptr %in) { ; PTXO-NEXT: .reg .b64 %rd<3>; ; PTXO-EMPTY: ; PTXO-NEXT: // %bb.0: -; PTXO-NEXT: ld.param.u64 %rd1, [ptr_generic_param_0]; -; PTXO-NEXT: ld.param.u64 %rd2, [ptr_generic_param_1]; -; PTXO-NEXT: ld.u32 %r1, [%rd2]; -; PTXO-NEXT: st.u32 [%rd1], %r1; +; PTXO-NEXT: ld.param.b64 %rd1, [ptr_generic_param_0]; +; PTXO-NEXT: ld.param.b64 %rd2, [ptr_generic_param_1]; +; PTXO-NEXT: ld.b32 %r1, [%rd2]; +; PTXO-NEXT: st.b32 [%rd1], %r1; ; PTXO-NEXT: ret; %v = load i32, ptr %in, align 4 store i32 %v, ptr %out, align 4 @@ -153,10 +153,10 @@ define ptx_kernel void @ptr_nongeneric(ptr addrspace(1) %out, ptr addrspace(3) % ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ptr_nongeneric_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ptr_nongeneric_param_1]; -; PTX-NEXT: ld.shared.u32 %r1, [%rd2]; -; PTX-NEXT: st.global.u32 [%rd1], %r1; +; PTX-NEXT: ld.param.b64 %rd1, [ptr_nongeneric_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ptr_nongeneric_param_1]; +; PTX-NEXT: ld.shared.b32 %r1, [%rd2]; +; PTX-NEXT: st.global.b32 [%rd1], %r1; ; PTX-NEXT: ret; %v = load i32, ptr addrspace(3) %in, align 4 store i32 %v, ptr addrspace(1) %out, align 4 @@ -184,10 +184,10 @@ define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) { ; PTXC-NEXT: .reg .b64 %rd<3>; ; PTXC-EMPTY: ; PTXC-NEXT: // %bb.0: -; PTXC-NEXT: ld.param.u64 %rd1, [ptr_as_int_param_0]; -; PTXC-NEXT: ld.param.u32 %r1, [ptr_as_int_param_1]; +; 
PTXC-NEXT: ld.param.b64 %rd1, [ptr_as_int_param_0]; +; PTXC-NEXT: ld.param.b32 %r1, [ptr_as_int_param_1]; ; PTXC-NEXT: cvta.to.global.u64 %rd2, %rd1; -; PTXC-NEXT: st.global.u32 [%rd2], %r1; +; PTXC-NEXT: st.global.b32 [%rd2], %r1; ; PTXC-NEXT: ret; ; ; PTXO-LABEL: ptr_as_int( @@ -196,9 +196,9 @@ define ptx_kernel void @ptr_as_int(i64 noundef %i, i32 noundef %v) { ; PTXO-NEXT: .reg .b64 %rd<2>; ; PTXO-EMPTY: ; PTXO-NEXT: // %bb.0: -; PTXO-NEXT: ld.param.u64 %rd1, [ptr_as_int_param_0]; -; PTXO-NEXT: ld.param.u32 %r1, [ptr_as_int_param_1]; -; PTXO-NEXT: st.u32 [%rd1], %r1; +; PTXO-NEXT: ld.param.b64 %rd1, [ptr_as_int_param_0]; +; PTXO-NEXT: ld.param.b32 %r1, [ptr_as_int_param_1]; +; PTXO-NEXT: st.b32 [%rd1], %r1; ; PTXO-NEXT: ret; %p = inttoptr i64 %i to ptr store i32 %v, ptr %p, align 4 @@ -232,10 +232,10 @@ define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%st ; PTXC-NEXT: .reg .b64 %rd<3>; ; PTXC-EMPTY: ; PTXC-NEXT: // %bb.0: -; PTXC-NEXT: ld.param.u32 %r1, [ptr_as_int_aggr_param_1]; -; PTXC-NEXT: ld.param.u64 %rd1, [ptr_as_int_aggr_param_0]; +; PTXC-NEXT: ld.param.b32 %r1, [ptr_as_int_aggr_param_1]; +; PTXC-NEXT: ld.param.b64 %rd1, [ptr_as_int_aggr_param_0]; ; PTXC-NEXT: cvta.to.global.u64 %rd2, %rd1; -; PTXC-NEXT: st.global.u32 [%rd2], %r1; +; PTXC-NEXT: st.global.b32 [%rd2], %r1; ; PTXC-NEXT: ret; ; ; PTXO-LABEL: ptr_as_int_aggr( @@ -244,9 +244,9 @@ define ptx_kernel void @ptr_as_int_aggr(ptr nocapture noundef readonly byval(%st ; PTXO-NEXT: .reg .b64 %rd<2>; ; PTXO-EMPTY: ; PTXO-NEXT: // %bb.0: -; PTXO-NEXT: ld.param.u32 %r1, [ptr_as_int_aggr_param_1]; -; PTXO-NEXT: ld.param.u64 %rd1, [ptr_as_int_aggr_param_0]; -; PTXO-NEXT: st.u32 [%rd1], %r1; +; PTXO-NEXT: ld.param.b32 %r1, [ptr_as_int_aggr_param_1]; +; PTXO-NEXT: ld.param.b64 %rd1, [ptr_as_int_aggr_param_0]; +; PTXO-NEXT: st.b32 [%rd1], %r1; ; PTXO-NEXT: ret; %i = load i64, ptr %s, align 8 %p = inttoptr i64 %i to ptr diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll 
b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index 1304ffe42c7b..54495cf0d61f 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -50,10 +50,10 @@ define dso_local ptx_kernel void @read_only(ptr nocapture noundef writeonly %out ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry -; PTX-NEXT: ld.param.u64 %rd1, [read_only_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [read_only_param_0]; ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; -; PTX-NEXT: ld.param.u32 %r1, [read_only_param_1]; -; PTX-NEXT: st.global.u32 [%rd2], %r1; +; PTX-NEXT: ld.param.b32 %r1, [read_only_param_1]; +; PTX-NEXT: st.global.b32 [%rd2], %r1; ; PTX-NEXT: ret; entry: %i = load i32, ptr %s, align 4 @@ -86,10 +86,10 @@ define dso_local ptx_kernel void @read_only_gep(ptr nocapture noundef writeonly ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry -; PTX-NEXT: ld.param.u64 %rd1, [read_only_gep_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [read_only_gep_param_0]; ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; -; PTX-NEXT: ld.param.u32 %r1, [read_only_gep_param_1+4]; -; PTX-NEXT: st.global.u32 [%rd2], %r1; +; PTX-NEXT: ld.param.b32 %r1, [read_only_gep_param_1+4]; +; PTX-NEXT: st.global.b32 [%rd2], %r1; ; PTX-NEXT: ret; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 @@ -146,10 +146,10 @@ define dso_local ptx_kernel void @escape_ptr(ptr nocapture noundef readnone %out ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: add.u64 %rd1, %SP, 0; ; PTX-NEXT: add.u64 %rd2, %SPL, 0; -; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_param_1+4]; -; PTX-NEXT: st.local.u32 [%rd2+4], %r1; -; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_param_1]; -; PTX-NEXT: st.local.u32 [%rd2], %r2; +; PTX-NEXT: ld.param.b32 %r1, [escape_ptr_param_1+4]; +; PTX-NEXT: st.local.b32 [%rd2+4], %r1; +; PTX-NEXT: ld.param.b32 %r2, [escape_ptr_param_1]; +; PTX-NEXT: st.local.b32 [%rd2], %r2; ; PTX-NEXT: { // callseq 0, 0 ; PTX-NEXT: .param .b64 param0; 
; PTX-NEXT: st.param.b64 [param0], %rd1; @@ -190,10 +190,10 @@ define dso_local ptx_kernel void @escape_ptr_gep(ptr nocapture noundef readnone ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: add.u64 %rd1, %SP, 0; ; PTX-NEXT: add.u64 %rd2, %SPL, 0; -; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_gep_param_1+4]; -; PTX-NEXT: st.local.u32 [%rd2+4], %r1; -; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_gep_param_1]; -; PTX-NEXT: st.local.u32 [%rd2], %r2; +; PTX-NEXT: ld.param.b32 %r1, [escape_ptr_gep_param_1+4]; +; PTX-NEXT: st.local.b32 [%rd2+4], %r1; +; PTX-NEXT: ld.param.b32 %r2, [escape_ptr_gep_param_1]; +; PTX-NEXT: st.local.b32 [%rd2], %r2; ; PTX-NEXT: add.s64 %rd3, %rd1, 4; ; PTX-NEXT: { // callseq 1, 0 ; PTX-NEXT: .param .b64 param0; @@ -233,15 +233,15 @@ define dso_local ptx_kernel void @escape_ptr_store(ptr nocapture noundef writeon ; PTX-NEXT: // %bb.0: // %entry ; PTX-NEXT: mov.b64 %SPL, __local_depot4; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; -; PTX-NEXT: ld.param.u64 %rd1, [escape_ptr_store_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [escape_ptr_store_param_0]; ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; ; PTX-NEXT: add.u64 %rd3, %SP, 0; ; PTX-NEXT: add.u64 %rd4, %SPL, 0; -; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_store_param_1+4]; -; PTX-NEXT: st.local.u32 [%rd4+4], %r1; -; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_store_param_1]; -; PTX-NEXT: st.local.u32 [%rd4], %r2; -; PTX-NEXT: st.global.u64 [%rd2], %rd3; +; PTX-NEXT: ld.param.b32 %r1, [escape_ptr_store_param_1+4]; +; PTX-NEXT: st.local.b32 [%rd4+4], %r1; +; PTX-NEXT: ld.param.b32 %r2, [escape_ptr_store_param_1]; +; PTX-NEXT: st.local.b32 [%rd4], %r2; +; PTX-NEXT: st.global.b64 [%rd2], %rd3; ; PTX-NEXT: ret; entry: store ptr %s, ptr %out, align 8 @@ -271,16 +271,16 @@ define dso_local ptx_kernel void @escape_ptr_gep_store(ptr nocapture noundef wri ; PTX-NEXT: // %bb.0: // %entry ; PTX-NEXT: mov.b64 %SPL, __local_depot5; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; -; PTX-NEXT: ld.param.u64 %rd1, 
[escape_ptr_gep_store_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [escape_ptr_gep_store_param_0]; ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; ; PTX-NEXT: add.u64 %rd3, %SP, 0; ; PTX-NEXT: add.u64 %rd4, %SPL, 0; -; PTX-NEXT: ld.param.u32 %r1, [escape_ptr_gep_store_param_1+4]; -; PTX-NEXT: st.local.u32 [%rd4+4], %r1; -; PTX-NEXT: ld.param.u32 %r2, [escape_ptr_gep_store_param_1]; -; PTX-NEXT: st.local.u32 [%rd4], %r2; +; PTX-NEXT: ld.param.b32 %r1, [escape_ptr_gep_store_param_1+4]; +; PTX-NEXT: st.local.b32 [%rd4+4], %r1; +; PTX-NEXT: ld.param.b32 %r2, [escape_ptr_gep_store_param_1]; +; PTX-NEXT: st.local.b32 [%rd4], %r2; ; PTX-NEXT: add.s64 %rd5, %rd3, 4; -; PTX-NEXT: st.global.u64 [%rd2], %rd5; +; PTX-NEXT: st.global.b64 [%rd2], %rd5; ; PTX-NEXT: ret; entry: %b = getelementptr inbounds nuw i8, ptr %s, i64 4 @@ -311,15 +311,15 @@ define dso_local ptx_kernel void @escape_ptrtoint(ptr nocapture noundef writeonl ; PTX-NEXT: // %bb.0: // %entry ; PTX-NEXT: mov.b64 %SPL, __local_depot6; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; -; PTX-NEXT: ld.param.u64 %rd1, [escape_ptrtoint_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [escape_ptrtoint_param_0]; ; PTX-NEXT: cvta.to.global.u64 %rd2, %rd1; ; PTX-NEXT: add.u64 %rd3, %SP, 0; ; PTX-NEXT: add.u64 %rd4, %SPL, 0; -; PTX-NEXT: ld.param.u32 %r1, [escape_ptrtoint_param_1+4]; -; PTX-NEXT: st.local.u32 [%rd4+4], %r1; -; PTX-NEXT: ld.param.u32 %r2, [escape_ptrtoint_param_1]; -; PTX-NEXT: st.local.u32 [%rd4], %r2; -; PTX-NEXT: st.global.u64 [%rd2], %rd3; +; PTX-NEXT: ld.param.b32 %r1, [escape_ptrtoint_param_1+4]; +; PTX-NEXT: st.local.b32 [%rd4+4], %r1; +; PTX-NEXT: ld.param.b32 %r2, [escape_ptrtoint_param_1]; +; PTX-NEXT: st.local.b32 [%rd4], %r2; +; PTX-NEXT: st.global.b64 [%rd2], %rd3; ; PTX-NEXT: ret; entry: %i = ptrtoint ptr %s to i64 @@ -348,39 +348,39 @@ define dso_local ptx_kernel void @memcpy_from_param(ptr nocapture noundef writeo ; PTX-NEXT: .reg .b64 %rd<2>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry -; PTX-NEXT: ld.param.u64 %rd1, 
[memcpy_from_param_param_0]; -; PTX-NEXT: ld.param.u8 %rs1, [memcpy_from_param_param_1+15]; -; PTX-NEXT: st.volatile.u8 [%rd1+15], %rs1; -; PTX-NEXT: ld.param.u8 %rs2, [memcpy_from_param_param_1+14]; -; PTX-NEXT: st.volatile.u8 [%rd1+14], %rs2; -; PTX-NEXT: ld.param.u8 %rs3, [memcpy_from_param_param_1+13]; -; PTX-NEXT: st.volatile.u8 [%rd1+13], %rs3; -; PTX-NEXT: ld.param.u8 %rs4, [memcpy_from_param_param_1+12]; -; PTX-NEXT: st.volatile.u8 [%rd1+12], %rs4; -; PTX-NEXT: ld.param.u8 %rs5, [memcpy_from_param_param_1+11]; -; PTX-NEXT: st.volatile.u8 [%rd1+11], %rs5; -; PTX-NEXT: ld.param.u8 %rs6, [memcpy_from_param_param_1+10]; -; PTX-NEXT: st.volatile.u8 [%rd1+10], %rs6; -; PTX-NEXT: ld.param.u8 %rs7, [memcpy_from_param_param_1+9]; -; PTX-NEXT: st.volatile.u8 [%rd1+9], %rs7; -; PTX-NEXT: ld.param.u8 %rs8, [memcpy_from_param_param_1+8]; -; PTX-NEXT: st.volatile.u8 [%rd1+8], %rs8; -; PTX-NEXT: ld.param.u8 %rs9, [memcpy_from_param_param_1+7]; -; PTX-NEXT: st.volatile.u8 [%rd1+7], %rs9; -; PTX-NEXT: ld.param.u8 %rs10, [memcpy_from_param_param_1+6]; -; PTX-NEXT: st.volatile.u8 [%rd1+6], %rs10; -; PTX-NEXT: ld.param.u8 %rs11, [memcpy_from_param_param_1+5]; -; PTX-NEXT: st.volatile.u8 [%rd1+5], %rs11; -; PTX-NEXT: ld.param.u8 %rs12, [memcpy_from_param_param_1+4]; -; PTX-NEXT: st.volatile.u8 [%rd1+4], %rs12; -; PTX-NEXT: ld.param.u8 %rs13, [memcpy_from_param_param_1+3]; -; PTX-NEXT: st.volatile.u8 [%rd1+3], %rs13; -; PTX-NEXT: ld.param.u8 %rs14, [memcpy_from_param_param_1+2]; -; PTX-NEXT: st.volatile.u8 [%rd1+2], %rs14; -; PTX-NEXT: ld.param.u8 %rs15, [memcpy_from_param_param_1+1]; -; PTX-NEXT: st.volatile.u8 [%rd1+1], %rs15; -; PTX-NEXT: ld.param.u8 %rs16, [memcpy_from_param_param_1]; -; PTX-NEXT: st.volatile.u8 [%rd1], %rs16; +; PTX-NEXT: ld.param.b64 %rd1, [memcpy_from_param_param_0]; +; PTX-NEXT: ld.param.b8 %rs1, [memcpy_from_param_param_1+15]; +; PTX-NEXT: st.volatile.b8 [%rd1+15], %rs1; +; PTX-NEXT: ld.param.b8 %rs2, [memcpy_from_param_param_1+14]; +; PTX-NEXT: 
st.volatile.b8 [%rd1+14], %rs2; +; PTX-NEXT: ld.param.b8 %rs3, [memcpy_from_param_param_1+13]; +; PTX-NEXT: st.volatile.b8 [%rd1+13], %rs3; +; PTX-NEXT: ld.param.b8 %rs4, [memcpy_from_param_param_1+12]; +; PTX-NEXT: st.volatile.b8 [%rd1+12], %rs4; +; PTX-NEXT: ld.param.b8 %rs5, [memcpy_from_param_param_1+11]; +; PTX-NEXT: st.volatile.b8 [%rd1+11], %rs5; +; PTX-NEXT: ld.param.b8 %rs6, [memcpy_from_param_param_1+10]; +; PTX-NEXT: st.volatile.b8 [%rd1+10], %rs6; +; PTX-NEXT: ld.param.b8 %rs7, [memcpy_from_param_param_1+9]; +; PTX-NEXT: st.volatile.b8 [%rd1+9], %rs7; +; PTX-NEXT: ld.param.b8 %rs8, [memcpy_from_param_param_1+8]; +; PTX-NEXT: st.volatile.b8 [%rd1+8], %rs8; +; PTX-NEXT: ld.param.b8 %rs9, [memcpy_from_param_param_1+7]; +; PTX-NEXT: st.volatile.b8 [%rd1+7], %rs9; +; PTX-NEXT: ld.param.b8 %rs10, [memcpy_from_param_param_1+6]; +; PTX-NEXT: st.volatile.b8 [%rd1+6], %rs10; +; PTX-NEXT: ld.param.b8 %rs11, [memcpy_from_param_param_1+5]; +; PTX-NEXT: st.volatile.b8 [%rd1+5], %rs11; +; PTX-NEXT: ld.param.b8 %rs12, [memcpy_from_param_param_1+4]; +; PTX-NEXT: st.volatile.b8 [%rd1+4], %rs12; +; PTX-NEXT: ld.param.b8 %rs13, [memcpy_from_param_param_1+3]; +; PTX-NEXT: st.volatile.b8 [%rd1+3], %rs13; +; PTX-NEXT: ld.param.b8 %rs14, [memcpy_from_param_param_1+2]; +; PTX-NEXT: st.volatile.b8 [%rd1+2], %rs14; +; PTX-NEXT: ld.param.b8 %rs15, [memcpy_from_param_param_1+1]; +; PTX-NEXT: st.volatile.b8 [%rd1+1], %rs15; +; PTX-NEXT: ld.param.b8 %rs16, [memcpy_from_param_param_1]; +; PTX-NEXT: st.volatile.b8 [%rd1], %rs16; ; PTX-NEXT: ret; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) @@ -408,39 +408,39 @@ define dso_local ptx_kernel void @memcpy_from_param_noalign (ptr nocapture nound ; PTX-NEXT: .reg .b64 %rd<2>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: // %entry -; PTX-NEXT: ld.param.u64 %rd1, [memcpy_from_param_noalign_param_0]; -; PTX-NEXT: ld.param.u8 %rs1, [memcpy_from_param_noalign_param_1+15]; -; PTX-NEXT: st.volatile.u8 [%rd1+15], %rs1; -; 
PTX-NEXT: ld.param.u8 %rs2, [memcpy_from_param_noalign_param_1+14]; -; PTX-NEXT: st.volatile.u8 [%rd1+14], %rs2; -; PTX-NEXT: ld.param.u8 %rs3, [memcpy_from_param_noalign_param_1+13]; -; PTX-NEXT: st.volatile.u8 [%rd1+13], %rs3; -; PTX-NEXT: ld.param.u8 %rs4, [memcpy_from_param_noalign_param_1+12]; -; PTX-NEXT: st.volatile.u8 [%rd1+12], %rs4; -; PTX-NEXT: ld.param.u8 %rs5, [memcpy_from_param_noalign_param_1+11]; -; PTX-NEXT: st.volatile.u8 [%rd1+11], %rs5; -; PTX-NEXT: ld.param.u8 %rs6, [memcpy_from_param_noalign_param_1+10]; -; PTX-NEXT: st.volatile.u8 [%rd1+10], %rs6; -; PTX-NEXT: ld.param.u8 %rs7, [memcpy_from_param_noalign_param_1+9]; -; PTX-NEXT: st.volatile.u8 [%rd1+9], %rs7; -; PTX-NEXT: ld.param.u8 %rs8, [memcpy_from_param_noalign_param_1+8]; -; PTX-NEXT: st.volatile.u8 [%rd1+8], %rs8; -; PTX-NEXT: ld.param.u8 %rs9, [memcpy_from_param_noalign_param_1+7]; -; PTX-NEXT: st.volatile.u8 [%rd1+7], %rs9; -; PTX-NEXT: ld.param.u8 %rs10, [memcpy_from_param_noalign_param_1+6]; -; PTX-NEXT: st.volatile.u8 [%rd1+6], %rs10; -; PTX-NEXT: ld.param.u8 %rs11, [memcpy_from_param_noalign_param_1+5]; -; PTX-NEXT: st.volatile.u8 [%rd1+5], %rs11; -; PTX-NEXT: ld.param.u8 %rs12, [memcpy_from_param_noalign_param_1+4]; -; PTX-NEXT: st.volatile.u8 [%rd1+4], %rs12; -; PTX-NEXT: ld.param.u8 %rs13, [memcpy_from_param_noalign_param_1+3]; -; PTX-NEXT: st.volatile.u8 [%rd1+3], %rs13; -; PTX-NEXT: ld.param.u8 %rs14, [memcpy_from_param_noalign_param_1+2]; -; PTX-NEXT: st.volatile.u8 [%rd1+2], %rs14; -; PTX-NEXT: ld.param.u8 %rs15, [memcpy_from_param_noalign_param_1+1]; -; PTX-NEXT: st.volatile.u8 [%rd1+1], %rs15; -; PTX-NEXT: ld.param.u8 %rs16, [memcpy_from_param_noalign_param_1]; -; PTX-NEXT: st.volatile.u8 [%rd1], %rs16; +; PTX-NEXT: ld.param.b64 %rd1, [memcpy_from_param_noalign_param_0]; +; PTX-NEXT: ld.param.b8 %rs1, [memcpy_from_param_noalign_param_1+15]; +; PTX-NEXT: st.volatile.b8 [%rd1+15], %rs1; +; PTX-NEXT: ld.param.b8 %rs2, [memcpy_from_param_noalign_param_1+14]; +; PTX-NEXT: 
st.volatile.b8 [%rd1+14], %rs2; +; PTX-NEXT: ld.param.b8 %rs3, [memcpy_from_param_noalign_param_1+13]; +; PTX-NEXT: st.volatile.b8 [%rd1+13], %rs3; +; PTX-NEXT: ld.param.b8 %rs4, [memcpy_from_param_noalign_param_1+12]; +; PTX-NEXT: st.volatile.b8 [%rd1+12], %rs4; +; PTX-NEXT: ld.param.b8 %rs5, [memcpy_from_param_noalign_param_1+11]; +; PTX-NEXT: st.volatile.b8 [%rd1+11], %rs5; +; PTX-NEXT: ld.param.b8 %rs6, [memcpy_from_param_noalign_param_1+10]; +; PTX-NEXT: st.volatile.b8 [%rd1+10], %rs6; +; PTX-NEXT: ld.param.b8 %rs7, [memcpy_from_param_noalign_param_1+9]; +; PTX-NEXT: st.volatile.b8 [%rd1+9], %rs7; +; PTX-NEXT: ld.param.b8 %rs8, [memcpy_from_param_noalign_param_1+8]; +; PTX-NEXT: st.volatile.b8 [%rd1+8], %rs8; +; PTX-NEXT: ld.param.b8 %rs9, [memcpy_from_param_noalign_param_1+7]; +; PTX-NEXT: st.volatile.b8 [%rd1+7], %rs9; +; PTX-NEXT: ld.param.b8 %rs10, [memcpy_from_param_noalign_param_1+6]; +; PTX-NEXT: st.volatile.b8 [%rd1+6], %rs10; +; PTX-NEXT: ld.param.b8 %rs11, [memcpy_from_param_noalign_param_1+5]; +; PTX-NEXT: st.volatile.b8 [%rd1+5], %rs11; +; PTX-NEXT: ld.param.b8 %rs12, [memcpy_from_param_noalign_param_1+4]; +; PTX-NEXT: st.volatile.b8 [%rd1+4], %rs12; +; PTX-NEXT: ld.param.b8 %rs13, [memcpy_from_param_noalign_param_1+3]; +; PTX-NEXT: st.volatile.b8 [%rd1+3], %rs13; +; PTX-NEXT: ld.param.b8 %rs14, [memcpy_from_param_noalign_param_1+2]; +; PTX-NEXT: st.volatile.b8 [%rd1+2], %rs14; +; PTX-NEXT: ld.param.b8 %rs15, [memcpy_from_param_noalign_param_1+1]; +; PTX-NEXT: st.volatile.b8 [%rd1+1], %rs15; +; PTX-NEXT: ld.param.b8 %rs16, [memcpy_from_param_noalign_param_1]; +; PTX-NEXT: st.volatile.b8 [%rd1], %rs16; ; PTX-NEXT: ret; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) @@ -469,58 +469,58 @@ define dso_local ptx_kernel void @memcpy_to_param(ptr nocapture noundef readonly ; PTX-NEXT: // %bb.0: // %entry ; PTX-NEXT: mov.b64 %SPL, __local_depot9; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; -; PTX-NEXT: ld.param.u64 %rd1, 
[memcpy_to_param_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [memcpy_to_param_param_0]; ; PTX-NEXT: add.u64 %rd3, %SPL, 0; -; PTX-NEXT: ld.param.u32 %r1, [memcpy_to_param_param_1+4]; -; PTX-NEXT: st.local.u32 [%rd3+4], %r1; -; PTX-NEXT: ld.param.u32 %r2, [memcpy_to_param_param_1]; -; PTX-NEXT: st.local.u32 [%rd3], %r2; -; PTX-NEXT: ld.volatile.u8 %rd4, [%rd1]; -; PTX-NEXT: ld.volatile.u8 %rd5, [%rd1+1]; +; PTX-NEXT: ld.param.b32 %r1, [memcpy_to_param_param_1+4]; +; PTX-NEXT: st.local.b32 [%rd3+4], %r1; +; PTX-NEXT: ld.param.b32 %r2, [memcpy_to_param_param_1]; +; PTX-NEXT: st.local.b32 [%rd3], %r2; +; PTX-NEXT: ld.volatile.b8 %rd4, [%rd1]; +; PTX-NEXT: ld.volatile.b8 %rd5, [%rd1+1]; ; PTX-NEXT: shl.b64 %rd6, %rd5, 8; ; PTX-NEXT: or.b64 %rd7, %rd6, %rd4; -; PTX-NEXT: ld.volatile.u8 %rd8, [%rd1+2]; +; PTX-NEXT: ld.volatile.b8 %rd8, [%rd1+2]; ; PTX-NEXT: shl.b64 %rd9, %rd8, 16; -; PTX-NEXT: ld.volatile.u8 %rd10, [%rd1+3]; +; PTX-NEXT: ld.volatile.b8 %rd10, [%rd1+3]; ; PTX-NEXT: shl.b64 %rd11, %rd10, 24; ; PTX-NEXT: or.b64 %rd12, %rd11, %rd9; ; PTX-NEXT: or.b64 %rd13, %rd12, %rd7; -; PTX-NEXT: ld.volatile.u8 %rd14, [%rd1+4]; -; PTX-NEXT: ld.volatile.u8 %rd15, [%rd1+5]; +; PTX-NEXT: ld.volatile.b8 %rd14, [%rd1+4]; +; PTX-NEXT: ld.volatile.b8 %rd15, [%rd1+5]; ; PTX-NEXT: shl.b64 %rd16, %rd15, 8; ; PTX-NEXT: or.b64 %rd17, %rd16, %rd14; -; PTX-NEXT: ld.volatile.u8 %rd18, [%rd1+6]; +; PTX-NEXT: ld.volatile.b8 %rd18, [%rd1+6]; ; PTX-NEXT: shl.b64 %rd19, %rd18, 16; -; PTX-NEXT: ld.volatile.u8 %rd20, [%rd1+7]; +; PTX-NEXT: ld.volatile.b8 %rd20, [%rd1+7]; ; PTX-NEXT: shl.b64 %rd21, %rd20, 24; ; PTX-NEXT: or.b64 %rd22, %rd21, %rd19; ; PTX-NEXT: or.b64 %rd23, %rd22, %rd17; ; PTX-NEXT: shl.b64 %rd24, %rd23, 32; ; PTX-NEXT: or.b64 %rd25, %rd24, %rd13; -; PTX-NEXT: st.volatile.u64 [%SP], %rd25; -; PTX-NEXT: ld.volatile.u8 %rd26, [%rd1+8]; -; PTX-NEXT: ld.volatile.u8 %rd27, [%rd1+9]; +; PTX-NEXT: st.volatile.b64 [%SP], %rd25; +; PTX-NEXT: ld.volatile.b8 %rd26, [%rd1+8]; +; PTX-NEXT: 
ld.volatile.b8 %rd27, [%rd1+9]; ; PTX-NEXT: shl.b64 %rd28, %rd27, 8; ; PTX-NEXT: or.b64 %rd29, %rd28, %rd26; -; PTX-NEXT: ld.volatile.u8 %rd30, [%rd1+10]; +; PTX-NEXT: ld.volatile.b8 %rd30, [%rd1+10]; ; PTX-NEXT: shl.b64 %rd31, %rd30, 16; -; PTX-NEXT: ld.volatile.u8 %rd32, [%rd1+11]; +; PTX-NEXT: ld.volatile.b8 %rd32, [%rd1+11]; ; PTX-NEXT: shl.b64 %rd33, %rd32, 24; ; PTX-NEXT: or.b64 %rd34, %rd33, %rd31; ; PTX-NEXT: or.b64 %rd35, %rd34, %rd29; -; PTX-NEXT: ld.volatile.u8 %rd36, [%rd1+12]; -; PTX-NEXT: ld.volatile.u8 %rd37, [%rd1+13]; +; PTX-NEXT: ld.volatile.b8 %rd36, [%rd1+12]; +; PTX-NEXT: ld.volatile.b8 %rd37, [%rd1+13]; ; PTX-NEXT: shl.b64 %rd38, %rd37, 8; ; PTX-NEXT: or.b64 %rd39, %rd38, %rd36; -; PTX-NEXT: ld.volatile.u8 %rd40, [%rd1+14]; +; PTX-NEXT: ld.volatile.b8 %rd40, [%rd1+14]; ; PTX-NEXT: shl.b64 %rd41, %rd40, 16; -; PTX-NEXT: ld.volatile.u8 %rd42, [%rd1+15]; +; PTX-NEXT: ld.volatile.b8 %rd42, [%rd1+15]; ; PTX-NEXT: shl.b64 %rd43, %rd42, 24; ; PTX-NEXT: or.b64 %rd44, %rd43, %rd41; ; PTX-NEXT: or.b64 %rd45, %rd44, %rd39; ; PTX-NEXT: shl.b64 %rd46, %rd45, 32; ; PTX-NEXT: or.b64 %rd47, %rd46, %rd35; -; PTX-NEXT: st.volatile.u64 [%SP+8], %rd47; +; PTX-NEXT: st.volatile.b64 [%SP+8], %rd47; ; PTX-NEXT: ret; entry: tail call void @llvm.memcpy.p0.p0.i64(ptr %s, ptr %in, i64 16, i1 true) @@ -600,15 +600,15 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3 ; PTX_60-NEXT: .reg .b64 %rd<3>; ; PTX_60-EMPTY: ; PTX_60-NEXT: // %bb.0: // %bb -; PTX_60-NEXT: ld.param.u8 %rs1, [test_select_param_3]; +; PTX_60-NEXT: ld.param.b8 %rs1, [test_select_param_3]; ; PTX_60-NEXT: and.b16 %rs2, %rs1, 1; ; PTX_60-NEXT: setp.ne.b16 %p1, %rs2, 0; -; PTX_60-NEXT: ld.param.u64 %rd1, [test_select_param_2]; +; PTX_60-NEXT: ld.param.b64 %rd1, [test_select_param_2]; ; PTX_60-NEXT: cvta.to.global.u64 %rd2, %rd1; -; PTX_60-NEXT: ld.param.u32 %r1, [test_select_param_1]; -; PTX_60-NEXT: ld.param.u32 %r2, [test_select_param_0]; +; PTX_60-NEXT: ld.param.b32 
%r1, [test_select_param_1]; +; PTX_60-NEXT: ld.param.b32 %r2, [test_select_param_0]; ; PTX_60-NEXT: selp.b32 %r3, %r2, %r1, %p1; -; PTX_60-NEXT: st.global.u32 [%rd2], %r3; +; PTX_60-NEXT: st.global.b32 [%rd2], %r3; ; PTX_60-NEXT: ret; ; ; PTX_70-LABEL: test_select( @@ -619,16 +619,16 @@ define ptx_kernel void @test_select(ptr byval(i32) align 4 %input1, ptr byval(i3 ; PTX_70-NEXT: .reg .b64 %rd<6>; ; PTX_70-EMPTY: ; PTX_70-NEXT: // %bb.0: // %bb -; PTX_70-NEXT: ld.param.u8 %rs1, [test_select_param_3]; +; PTX_70-NEXT: ld.param.b8 %rs1, [test_select_param_3]; ; PTX_70-NEXT: and.b16 %rs2, %rs1, 1; ; PTX_70-NEXT: setp.ne.b16 %p1, %rs2, 0; ; PTX_70-NEXT: mov.b64 %rd1, test_select_param_0; -; PTX_70-NEXT: ld.param.u64 %rd2, [test_select_param_2]; +; PTX_70-NEXT: ld.param.b64 %rd2, [test_select_param_2]; ; PTX_70-NEXT: cvta.to.global.u64 %rd3, %rd2; ; PTX_70-NEXT: mov.b64 %rd4, test_select_param_1; ; PTX_70-NEXT: selp.b64 %rd5, %rd1, %rd4, %p1; -; PTX_70-NEXT: ld.param.u32 %r1, [%rd5]; -; PTX_70-NEXT: st.global.u32 [%rd3], %r1; +; PTX_70-NEXT: ld.param.b32 %r1, [%rd5]; +; PTX_70-NEXT: st.global.b32 [%rd3], %r1; ; PTX_70-NEXT: ret; bb: %ptrnew = select i1 %cond, ptr %input1, ptr %input2 @@ -664,18 +664,18 @@ define ptx_kernel void @test_select_write(ptr byval(i32) align 4 %input1, ptr by ; PTX-NEXT: // %bb.0: // %bb ; PTX-NEXT: mov.b64 %SPL, __local_depot12; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; -; PTX-NEXT: ld.param.u8 %rs1, [test_select_write_param_3]; +; PTX-NEXT: ld.param.b8 %rs1, [test_select_write_param_3]; ; PTX-NEXT: and.b16 %rs2, %rs1, 1; ; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0; -; PTX-NEXT: ld.param.u32 %r1, [test_select_write_param_1]; -; PTX-NEXT: st.u32 [%SP], %r1; -; PTX-NEXT: ld.param.u32 %r2, [test_select_write_param_0]; -; PTX-NEXT: st.u32 [%SP+4], %r2; +; PTX-NEXT: ld.param.b32 %r1, [test_select_write_param_1]; +; PTX-NEXT: st.b32 [%SP], %r1; +; PTX-NEXT: ld.param.b32 %r2, [test_select_write_param_0]; +; PTX-NEXT: st.b32 [%SP+4], %r2; ; PTX-NEXT: add.u64 
%rd2, %SPL, 4; ; PTX-NEXT: add.u64 %rd4, %SPL, 0; ; PTX-NEXT: selp.b64 %rd5, %rd2, %rd4, %p1; ; PTX-NEXT: mov.b32 %r3, 1; -; PTX-NEXT: st.local.u32 [%rd5], %r3; +; PTX-NEXT: st.local.b32 [%rd5], %r3; ; PTX-NEXT: ret; bb: %ptrnew = select i1 %cond, ptr %input1, ptr %input2 @@ -756,17 +756,17 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; PTX_60-NEXT: .reg .b64 %rd<3>; ; PTX_60-EMPTY: ; PTX_60-NEXT: // %bb.0: // %bb -; PTX_60-NEXT: ld.param.u8 %rs1, [test_phi_param_3]; +; PTX_60-NEXT: ld.param.b8 %rs1, [test_phi_param_3]; ; PTX_60-NEXT: and.b16 %rs2, %rs1, 1; ; PTX_60-NEXT: setp.ne.b16 %p1, %rs2, 0; -; PTX_60-NEXT: ld.param.u64 %rd2, [test_phi_param_2]; +; PTX_60-NEXT: ld.param.b64 %rd2, [test_phi_param_2]; ; PTX_60-NEXT: cvta.to.global.u64 %rd1, %rd2; -; PTX_60-NEXT: ld.param.u32 %r4, [test_phi_param_0]; +; PTX_60-NEXT: ld.param.b32 %r4, [test_phi_param_0]; ; PTX_60-NEXT: @%p1 bra $L__BB13_2; ; PTX_60-NEXT: // %bb.1: // %second -; PTX_60-NEXT: ld.param.u32 %r4, [test_phi_param_1+4]; +; PTX_60-NEXT: ld.param.b32 %r4, [test_phi_param_1+4]; ; PTX_60-NEXT: $L__BB13_2: // %merge -; PTX_60-NEXT: st.global.u32 [%rd1], %r4; +; PTX_60-NEXT: st.global.b32 [%rd1], %r4; ; PTX_60-NEXT: ret; ; ; PTX_70-LABEL: test_phi( @@ -777,19 +777,19 @@ define ptx_kernel void @test_phi(ptr byval(%struct.S) align 4 %input1, ptr byval ; PTX_70-NEXT: .reg .b64 %rd<8>; ; PTX_70-EMPTY: ; PTX_70-NEXT: // %bb.0: // %bb -; PTX_70-NEXT: ld.param.u8 %rs1, [test_phi_param_3]; +; PTX_70-NEXT: ld.param.b8 %rs1, [test_phi_param_3]; ; PTX_70-NEXT: and.b16 %rs2, %rs1, 1; ; PTX_70-NEXT: setp.ne.b16 %p1, %rs2, 0; ; PTX_70-NEXT: mov.b64 %rd7, test_phi_param_0; -; PTX_70-NEXT: ld.param.u64 %rd6, [test_phi_param_2]; +; PTX_70-NEXT: ld.param.b64 %rd6, [test_phi_param_2]; ; PTX_70-NEXT: cvta.to.global.u64 %rd1, %rd6; ; PTX_70-NEXT: @%p1 bra $L__BB13_2; ; PTX_70-NEXT: // %bb.1: // %second ; PTX_70-NEXT: mov.b64 %rd2, test_phi_param_1; ; PTX_70-NEXT: add.s64 %rd7, %rd2, 4; ; 
PTX_70-NEXT: $L__BB13_2: // %merge -; PTX_70-NEXT: ld.param.u32 %r1, [%rd7]; -; PTX_70-NEXT: st.global.u32 [%rd1], %r1; +; PTX_70-NEXT: ld.param.b32 %r1, [%rd7]; +; PTX_70-NEXT: st.global.b32 [%rd1], %r1; ; PTX_70-NEXT: ret; bb: br i1 %cond, label %first, label %second @@ -844,21 +844,21 @@ define ptx_kernel void @test_phi_write(ptr byval(%struct.S) align 4 %input1, ptr ; PTX-NEXT: // %bb.0: // %bb ; PTX-NEXT: mov.b64 %SPL, __local_depot14; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; -; PTX-NEXT: ld.param.u8 %rs1, [test_phi_write_param_2]; +; PTX-NEXT: ld.param.b8 %rs1, [test_phi_write_param_2]; ; PTX-NEXT: and.b16 %rs2, %rs1, 1; ; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0; ; PTX-NEXT: add.u64 %rd1, %SPL, 0; -; PTX-NEXT: ld.param.u32 %r1, [test_phi_write_param_1+4]; -; PTX-NEXT: st.u32 [%SP], %r1; +; PTX-NEXT: ld.param.b32 %r1, [test_phi_write_param_1+4]; +; PTX-NEXT: st.b32 [%SP], %r1; ; PTX-NEXT: add.u64 %rd6, %SPL, 4; -; PTX-NEXT: ld.param.u32 %r2, [test_phi_write_param_0]; -; PTX-NEXT: st.u32 [%SP+4], %r2; +; PTX-NEXT: ld.param.b32 %r2, [test_phi_write_param_0]; +; PTX-NEXT: st.b32 [%SP+4], %r2; ; PTX-NEXT: @%p1 bra $L__BB14_2; ; PTX-NEXT: // %bb.1: // %second ; PTX-NEXT: mov.b64 %rd6, %rd1; ; PTX-NEXT: $L__BB14_2: // %merge ; PTX-NEXT: mov.b32 %r3, 1; -; PTX-NEXT: st.local.u32 [%rd6], %r3; +; PTX-NEXT: st.local.b32 [%rd6], %r3; ; PTX-NEXT: ret; bb: br i1 %cond, label %first, label %second @@ -897,8 +897,8 @@ define ptx_kernel void @test_forward_byval_arg(ptr byval(i32) align 4 %input) { ; PTX-NEXT: // %bb.0: ; PTX-NEXT: mov.b64 %SPL, __local_depot15; ; PTX-NEXT: add.u64 %rd2, %SPL, 0; -; PTX-NEXT: ld.param.u32 %r1, [test_forward_byval_arg_param_0]; -; PTX-NEXT: st.local.u32 [%rd2], %r1; +; PTX-NEXT: ld.param.b32 %r1, [test_forward_byval_arg_param_0]; +; PTX-NEXT: st.local.b32 [%rd2], %r1; ; PTX-NEXT: { // callseq 2, 0 ; PTX-NEXT: .param .align 4 .b8 param0[4]; ; PTX-NEXT: st.param.b32 [param0], %r1; @@ -925,7 +925,7 @@ define void @device_func(ptr byval(i32) align 4 
%input) { ; PTX-NEXT: .reg .b64 %rd<2>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u32 %r1, [device_func_param_0]; +; PTX-NEXT: ld.param.b32 %r1, [device_func_param_0]; ; PTX-NEXT: { // callseq 3, 0 ; PTX-NEXT: .param .align 4 .b8 param0[4]; ; PTX-NEXT: st.param.b32 [param0], %r1; diff --git a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll index 2e64c2559481..5022684adf71 100644 --- a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll +++ b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll @@ -11,9 +11,9 @@ define ptx_kernel void @kernel(ptr %input, ptr %output) { ; CHECK: cvta.to.global.u64 ; CHECK: cvta.to.global.u64 %1 = load float, ptr %input, align 4 -; CHECK: ld.global.f32 +; CHECK: ld.global.b32 store float %1, ptr %output, align 4 -; CHECK: st.global.f32 +; CHECK: st.global.b32 ret void } @@ -21,9 +21,9 @@ define ptx_kernel void @kernel2(ptr addrspace(1) %input, ptr addrspace(1) %outpu ; CHECK-LABEL: .visible .entry kernel2( ; CHECK-NOT: cvta.to.global.u64 %1 = load float, ptr addrspace(1) %input, align 4 -; CHECK: ld.global.f32 +; CHECK: ld.global.b32 store float %1, ptr addrspace(1) %output, align 4 -; CHECK: st.global.f32 +; CHECK: st.global.b32 ret void } @@ -31,16 +31,16 @@ define ptx_kernel void @kernel2(ptr addrspace(1) %input, ptr addrspace(1) %outpu define ptx_kernel void @ptr_in_byval_kernel(ptr byval(%struct.S) %input, ptr %output) { ; CHECK-LABEL: .visible .entry ptr_in_byval_kernel( -; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_kernel_param_1] +; CHECK: ld.param.b64 %[[optr:rd.*]], [ptr_in_byval_kernel_param_1] ; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]]; -; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8] +; CHECK: ld.param.b64 %[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8] ; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]]; %b_ptr = getelementptr inbounds %struct.S, ptr %input, i64 0, i32 1 %b = load ptr, ptr %b_ptr, align 8 %v = load i32, ptr 
%b, align 4 -; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]] +; CHECK: ld.global.b32 %[[val:.*]], [%[[iptr_g]]] store i32 %v, ptr %output, align 4 -; CHECK: st.global.u32 [%[[optr_g]]], %[[val]] +; CHECK: st.global.b32 [%[[optr_g]]], %[[val]] ret void } @@ -49,14 +49,14 @@ define ptx_kernel void @ptr_in_byval_kernel(ptr byval(%struct.S) %input, ptr %ou ; There's also no assumption that all pointers within are in global space. define void @ptr_in_byval_func(ptr byval(%struct.S) %input, ptr %output) { ; CHECK-LABEL: .visible .func ptr_in_byval_func( -; CHECK: ld.param.u64 %[[optr:rd.*]], [ptr_in_byval_func_param_1] -; CHECK: ld.param.u64 %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8] +; CHECK: ld.param.b64 %[[optr:rd.*]], [ptr_in_byval_func_param_1] +; CHECK: ld.param.b64 %[[iptr:rd.*]], [ptr_in_byval_func_param_0+8] %b_ptr = getelementptr inbounds %struct.S, ptr %input, i64 0, i32 1 %b = load ptr, ptr %b_ptr, align 8 %v = load i32, ptr %b, align 4 -; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]] +; CHECK: ld.b32 %[[val:.*]], [%[[iptr]]] store i32 %v, ptr %output, align 4 -; CHECK: st.u32 [%[[optr]]], %[[val]] +; CHECK: st.b32 [%[[optr]]], %[[val]] ret void } diff --git a/llvm/test/CodeGen/NVPTX/machine-sink.ll b/llvm/test/CodeGen/NVPTX/machine-sink.ll index bcd19df1f6a6..ce16a41a11ff 100644 --- a/llvm/test/CodeGen/NVPTX/machine-sink.ll +++ b/llvm/test/CodeGen/NVPTX/machine-sink.ll @@ -17,8 +17,8 @@ define float @post_dominate(float %x, i1 %cond) { entry: %0 = load float, ptr addrspacecast (ptr addrspace(3) @scalar1 to ptr), align 4 %1 = load float, ptr addrspacecast (ptr addrspace(3) @scalar2 to ptr), align 4 -; CHECK: ld.shared.f32 -; CHECK: ld.shared.f32 +; CHECK: ld.shared.b32 +; CHECK: ld.shared.b32 %2 = fmul float %0, %0 %3 = fmul float %1, %2 ; CHECK-NOT: bra diff --git a/llvm/test/CodeGen/NVPTX/match.ll b/llvm/test/CodeGen/NVPTX/match.ll index 4e783e8009f0..ae01b0d3cc7e 100644 --- a/llvm/test/CodeGen/NVPTX/match.ll +++ b/llvm/test/CodeGen/NVPTX/match.ll @@ -6,8 
+6,8 @@ declare i32 @llvm.nvvm.match.any.sync.i64(i32, i64) ; CHECK-LABEL: .func{{.*}}match_any_sync_i32 define i32 @match_any_sync_i32(i32 %mask, i32 %value) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]], [match_any_sync_i32_param_0]; - ; CHECK: ld.param.u32 [[VALUE:%r[0-9]+]], [match_any_sync_i32_param_1]; + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]], [match_any_sync_i32_param_0]; + ; CHECK: ld.param.b32 [[VALUE:%r[0-9]+]], [match_any_sync_i32_param_1]; ; CHECK: match.any.sync.b32 [[V0:%r[0-9]+]], [[VALUE]], [[MASK]]; %v0 = call i32 @llvm.nvvm.match.any.sync.i32(i32 %mask, i32 %value) @@ -25,8 +25,8 @@ define i32 @match_any_sync_i32(i32 %mask, i32 %value) { ; CHECK-LABEL: .func{{.*}}match_any_sync_i64 define i32 @match_any_sync_i64(i32 %mask, i64 %value) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]], [match_any_sync_i64_param_0]; - ; CHECK: ld.param.u64 [[VALUE:%rd[0-9]+]], [match_any_sync_i64_param_1]; + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]], [match_any_sync_i64_param_0]; + ; CHECK: ld.param.b64 [[VALUE:%rd[0-9]+]], [match_any_sync_i64_param_1]; ; CHECK: match.any.sync.b64 [[V0:%r[0-9]+]], [[VALUE]], [[MASK]]; %v0 = call i32 @llvm.nvvm.match.any.sync.i64(i32 %mask, i64 %value) @@ -47,8 +47,8 @@ declare {i32, i1} @llvm.nvvm.match.all.sync.i64p(i32, i64) ; CHECK-LABEL: .func{{.*}}match_all_sync_i32p( define {i32,i1} @match_all_sync_i32p(i32 %mask, i32 %value) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]], [match_all_sync_i32p_param_0]; - ; CHECK: ld.param.u32 [[VALUE:%r[0-9]+]], [match_all_sync_i32p_param_1]; + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]], [match_all_sync_i32p_param_0]; + ; CHECK: ld.param.b32 [[VALUE:%r[0-9]+]], [match_all_sync_i32p_param_1]; ; CHECK: match.all.sync.b32 {{%r[0-9]+\|%p[0-9]+}}, [[VALUE]], [[MASK]]; %r1 = call {i32, i1} @llvm.nvvm.match.all.sync.i32p(i32 %mask, i32 %value) @@ -83,8 +83,8 @@ define {i32,i1} @match_all_sync_i32p(i32 %mask, i32 %value) { ; CHECK-LABEL: .func{{.*}}match_all_sync_i64p( define {i32,i1} @match_all_sync_i64p(i32 
%mask, i64 %value) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]], [match_all_sync_i64p_param_0]; - ; CHECK: ld.param.u64 [[VALUE:%rd[0-9]+]], [match_all_sync_i64p_param_1]; + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]], [match_all_sync_i64p_param_0]; + ; CHECK: ld.param.b64 [[VALUE:%rd[0-9]+]], [match_all_sync_i64p_param_1]; ; CHECK: match.all.sync.b64 {{%r[0-9]+\|%p[0-9]+}}, [[VALUE]], [[MASK]]; %r1 = call {i32, i1} @llvm.nvvm.match.all.sync.i64p(i32 %mask, i64 %value) diff --git a/llvm/test/CodeGen/NVPTX/math-intrins.ll b/llvm/test/CodeGen/NVPTX/math-intrins.ll index a6d01c16c0ab..c7c1ea84f9a3 100644 --- a/llvm/test/CodeGen/NVPTX/math-intrins.ll +++ b/llvm/test/CodeGen/NVPTX/math-intrins.ll @@ -53,9 +53,9 @@ define float @ceil_float(float %a) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [ceil_float_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [ceil_float_param_0]; ; CHECK-NEXT: cvt.rpi.f32.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %b = call float @llvm.ceil.f32(float %a) ret float %b @@ -67,9 +67,9 @@ define float @ceil_float_ftz(float %a) #1 { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [ceil_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [ceil_float_ftz_param_0]; ; CHECK-NEXT: cvt.rpi.ftz.f32.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %b = call float @llvm.ceil.f32(float %a) ret float %b @@ -81,9 +81,9 @@ define double @ceil_double(double %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [ceil_double_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [ceil_double_param_0]; ; CHECK-NEXT: cvt.rpi.f64.f64 %fd2, %fd1; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd2; ; 
CHECK-NEXT: ret; %b = call double @llvm.ceil.f64(double %a) ret double %b @@ -97,9 +97,9 @@ define float @floor_float(float %a) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [floor_float_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [floor_float_param_0]; ; CHECK-NEXT: cvt.rmi.f32.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %b = call float @llvm.floor.f32(float %a) ret float %b @@ -111,9 +111,9 @@ define float @floor_float_ftz(float %a) #1 { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [floor_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [floor_float_ftz_param_0]; ; CHECK-NEXT: cvt.rmi.ftz.f32.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %b = call float @llvm.floor.f32(float %a) ret float %b @@ -125,9 +125,9 @@ define double @floor_double(double %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [floor_double_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [floor_double_param_0]; ; CHECK-NEXT: cvt.rmi.f64.f64 %fd2, %fd1; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd2; ; CHECK-NEXT: ret; %b = call double @llvm.floor.f64(double %a) ret double %b @@ -144,7 +144,7 @@ define float @round_float(float %a) { ; CHECK-NEXT: .reg .b32 %f<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [round_float_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [round_float_param_0]; ; CHECK-NEXT: mov.b32 %r1, %f1; ; CHECK-NEXT: and.b32 %r2, %r1, -2147483648; ; CHECK-NEXT: or.b32 %r3, %r2, 1056964608; @@ -157,7 +157,7 @@ define float @round_float(float %a) { ; CHECK-NEXT: cvt.rzi.f32.f32 %f7, %f1; ; CHECK-NEXT: setp.lt.f32 %p2, %f5, 0f3F000000; ; CHECK-NEXT: selp.f32 %f8, 
%f7, %f6, %p2; -; CHECK-NEXT: st.param.f32 [func_retval0], %f8; +; CHECK-NEXT: st.param.b32 [func_retval0], %f8; ; CHECK-NEXT: ret; %b = call float @llvm.round.f32(float %a) ret float %b @@ -172,7 +172,7 @@ define float @round_float_ftz(float %a) #1 { ; CHECK-NEXT: .reg .b32 %f<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [round_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [round_float_ftz_param_0]; ; CHECK-NEXT: mov.b32 %r1, %f1; ; CHECK-NEXT: and.b32 %r2, %r1, -2147483648; ; CHECK-NEXT: or.b32 %r3, %r2, 1056964608; @@ -185,7 +185,7 @@ define float @round_float_ftz(float %a) #1 { ; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %f7, %f1; ; CHECK-NEXT: setp.lt.ftz.f32 %p2, %f5, 0f3F000000; ; CHECK-NEXT: selp.f32 %f8, %f7, %f6, %p2; -; CHECK-NEXT: st.param.f32 [func_retval0], %f8; +; CHECK-NEXT: st.param.b32 [func_retval0], %f8; ; CHECK-NEXT: ret; %b = call float @llvm.round.f32(float %a) ret float %b @@ -199,7 +199,7 @@ define double @round_double(double %a) { ; CHECK-NEXT: .reg .b64 %fd<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [round_double_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [round_double_param_0]; ; CHECK-NEXT: abs.f64 %fd2, %fd1; ; CHECK-NEXT: setp.lt.f64 %p1, %fd2, 0d3FE0000000000000; ; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FE0000000000000; @@ -208,7 +208,7 @@ define double @round_double(double %a) { ; CHECK-NEXT: copysign.f64 %fd6, %fd1, %fd5; ; CHECK-NEXT: setp.gt.f64 %p2, %fd2, 0d4330000000000000; ; CHECK-NEXT: selp.f64 %fd7, %fd1, %fd6, %p2; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd7; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd7; ; CHECK-NEXT: ret; %b = call double @llvm.round.f64(double %a) ret double %b @@ -222,9 +222,9 @@ define float @nearbyint_float(float %a) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [nearbyint_float_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [nearbyint_float_param_0]; ; CHECK-NEXT: 
cvt.rni.f32.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %b = call float @llvm.nearbyint.f32(float %a) ret float %b @@ -236,9 +236,9 @@ define float @nearbyint_float_ftz(float %a) #1 { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [nearbyint_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [nearbyint_float_ftz_param_0]; ; CHECK-NEXT: cvt.rni.ftz.f32.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %b = call float @llvm.nearbyint.f32(float %a) ret float %b @@ -250,9 +250,9 @@ define double @nearbyint_double(double %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [nearbyint_double_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [nearbyint_double_param_0]; ; CHECK-NEXT: cvt.rni.f64.f64 %fd2, %fd1; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd2; ; CHECK-NEXT: ret; %b = call double @llvm.nearbyint.f64(double %a) ret double %b @@ -266,9 +266,9 @@ define float @rint_float(float %a) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [rint_float_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [rint_float_param_0]; ; CHECK-NEXT: cvt.rni.f32.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %b = call float @llvm.rint.f32(float %a) ret float %b @@ -280,9 +280,9 @@ define float @rint_float_ftz(float %a) #1 { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [rint_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [rint_float_ftz_param_0]; ; CHECK-NEXT: cvt.rni.ftz.f32.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 
[func_retval0], %f2; ; CHECK-NEXT: ret; %b = call float @llvm.rint.f32(float %a) ret float %b @@ -294,9 +294,9 @@ define double @rint_double(double %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [rint_double_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [rint_double_param_0]; ; CHECK-NEXT: cvt.rni.f64.f64 %fd2, %fd1; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd2; ; CHECK-NEXT: ret; %b = call double @llvm.rint.f64(double %a) ret double %b @@ -310,9 +310,9 @@ define float @roundeven_float(float %a) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [roundeven_float_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [roundeven_float_param_0]; ; CHECK-NEXT: cvt.rni.f32.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %b = call float @llvm.roundeven.f32(float %a) ret float %b @@ -324,9 +324,9 @@ define float @roundeven_float_ftz(float %a) #1 { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [roundeven_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [roundeven_float_ftz_param_0]; ; CHECK-NEXT: cvt.rni.ftz.f32.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %b = call float @llvm.roundeven.f32(float %a) ret float %b @@ -338,9 +338,9 @@ define double @roundeven_double(double %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [roundeven_double_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [roundeven_double_param_0]; ; CHECK-NEXT: cvt.rni.f64.f64 %fd2, %fd1; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd2; ; CHECK-NEXT: ret; %b = call double @llvm.roundeven.f64(double %a) ret double %b @@ 
-354,9 +354,9 @@ define float @trunc_float(float %a) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [trunc_float_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [trunc_float_param_0]; ; CHECK-NEXT: cvt.rzi.f32.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %b = call float @llvm.trunc.f32(float %a) ret float %b @@ -368,9 +368,9 @@ define float @trunc_float_ftz(float %a) #1 { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [trunc_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [trunc_float_ftz_param_0]; ; CHECK-NEXT: cvt.rzi.ftz.f32.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %b = call float @llvm.trunc.f32(float %a) ret float %b @@ -382,9 +382,9 @@ define double @trunc_double(double %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [trunc_double_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [trunc_double_param_0]; ; CHECK-NEXT: cvt.rzi.f64.f64 %fd2, %fd1; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd2; ; CHECK-NEXT: ret; %b = call double @llvm.trunc.f64(double %a) ret double %b @@ -398,9 +398,9 @@ define float @abs_float(float %a) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [abs_float_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [abs_float_param_0]; ; CHECK-NEXT: abs.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %b = call float @llvm.fabs.f32(float %a) ret float %b @@ -412,9 +412,9 @@ define float @abs_float_ftz(float %a) #1 { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, 
[abs_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [abs_float_ftz_param_0]; ; CHECK-NEXT: abs.ftz.f32 %f2, %f1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %b = call float @llvm.fabs.f32(float %a) ret float %b @@ -426,9 +426,9 @@ define double @abs_double(double %a) { ; CHECK-NEXT: .reg .b64 %fd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [abs_double_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [abs_double_param_0]; ; CHECK-NEXT: abs.f64 %fd2, %fd1; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd2; ; CHECK-NEXT: ret; %b = call double @llvm.fabs.f64(double %a) ret double %b @@ -487,10 +487,10 @@ define float @minnum_float(float %a, float %b) { ; CHECK-NEXT: .reg .b32 %f<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [minnum_float_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [minnum_float_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [minnum_float_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [minnum_float_param_1]; ; CHECK-NEXT: min.f32 %f3, %f1, %f2; -; CHECK-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-NEXT: ret; %x = call float @llvm.minnum.f32(float %a, float %b) ret float %x @@ -502,9 +502,9 @@ define float @minnum_imm1(float %a) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [minnum_imm1_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [minnum_imm1_param_0]; ; CHECK-NEXT: min.f32 %f2, %f1, 0f00000000; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %x = call float @llvm.minnum.f32(float %a, float 0.0) ret float %x @@ -516,9 +516,9 @@ define float @minnum_imm2(float %a) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [minnum_imm2_param_0]; +; CHECK-NEXT: 
ld.param.b32 %f1, [minnum_imm2_param_0]; ; CHECK-NEXT: min.f32 %f2, %f1, 0f00000000; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %x = call float @llvm.minnum.f32(float 0.0, float %a) ret float %x @@ -530,10 +530,10 @@ define float @minnum_float_ftz(float %a, float %b) #1 { ; CHECK-NEXT: .reg .b32 %f<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [minnum_float_ftz_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [minnum_float_ftz_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [minnum_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [minnum_float_ftz_param_1]; ; CHECK-NEXT: min.ftz.f32 %f3, %f1, %f2; -; CHECK-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-NEXT: ret; %x = call float @llvm.minnum.f32(float %a, float %b) ret float %x @@ -545,10 +545,10 @@ define double @minnum_double(double %a, double %b) { ; CHECK-NEXT: .reg .b64 %fd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [minnum_double_param_0]; -; CHECK-NEXT: ld.param.f64 %fd2, [minnum_double_param_1]; +; CHECK-NEXT: ld.param.b64 %fd1, [minnum_double_param_0]; +; CHECK-NEXT: ld.param.b64 %fd2, [minnum_double_param_1]; ; CHECK-NEXT: min.f64 %fd3, %fd1, %fd2; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd3; ; CHECK-NEXT: ret; %x = call double @llvm.minnum.f64(double %a, double %b) ret double %x @@ -690,9 +690,9 @@ define float @minimum_float(float %a, float %b) { ; CHECK-NOF16-NEXT: .reg .b32 %f<8>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.f32 %f1, [minimum_float_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %f1, [minimum_float_param_0]; ; CHECK-NOF16-NEXT: mov.b32 %r1, %f1; -; CHECK-NOF16-NEXT: ld.param.f32 %f2, [minimum_float_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %f2, [minimum_float_param_1]; ; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %f1, %f2; ; 
CHECK-NOF16-NEXT: min.f32 %f3, %f1, %f2; ; CHECK-NOF16-NEXT: selp.f32 %f4, 0f7FC00000, %f3, %p1; @@ -703,7 +703,7 @@ define float @minimum_float(float %a, float %b) { ; CHECK-NOF16-NEXT: selp.f32 %f6, %f2, %f5, %p3; ; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %f4, 0f00000000; ; CHECK-NOF16-NEXT: selp.f32 %f7, %f6, %f4, %p4; -; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f7; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %f7; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: minimum_float( @@ -711,10 +711,10 @@ define float @minimum_float(float %a, float %b) { ; CHECK-F16-NEXT: .reg .b32 %f<4>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.f32 %f1, [minimum_float_param_0]; -; CHECK-F16-NEXT: ld.param.f32 %f2, [minimum_float_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %f1, [minimum_float_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %f2, [minimum_float_param_1]; ; CHECK-F16-NEXT: min.NaN.f32 %f3, %f1, %f2; -; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-F16-NEXT: ret; ; ; CHECK-SM80-NOF16-LABEL: minimum_float( @@ -722,10 +722,10 @@ define float @minimum_float(float %a, float %b) { ; CHECK-SM80-NOF16-NEXT: .reg .b32 %f<4>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: -; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [minimum_float_param_0]; -; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f2, [minimum_float_param_1]; +; CHECK-SM80-NOF16-NEXT: ld.param.b32 %f1, [minimum_float_param_0]; +; CHECK-SM80-NOF16-NEXT: ld.param.b32 %f2, [minimum_float_param_1]; ; CHECK-SM80-NOF16-NEXT: min.NaN.f32 %f3, %f1, %f2; -; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-SM80-NOF16-NEXT: ret; %x = call float @llvm.minimum.f32(float %a, float %b) ret float %x @@ -739,7 +739,7 @@ define float @minimum_imm1(float %a) { ; CHECK-NOF16-NEXT: .reg .b32 %f<6>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: 
ld.param.f32 %f1, [minimum_imm1_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %f1, [minimum_imm1_param_0]; ; CHECK-NOF16-NEXT: mov.b32 %r1, %f1; ; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %f1, %f1; ; CHECK-NOF16-NEXT: min.f32 %f2, %f1, 0f00000000; @@ -748,7 +748,7 @@ define float @minimum_imm1(float %a) { ; CHECK-NOF16-NEXT: selp.f32 %f4, %f1, %f3, %p2; ; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %f3, 0f00000000; ; CHECK-NOF16-NEXT: selp.f32 %f5, %f4, %f3, %p3; -; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f5; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %f5; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: minimum_imm1( @@ -756,9 +756,9 @@ define float @minimum_imm1(float %a) { ; CHECK-F16-NEXT: .reg .b32 %f<3>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.f32 %f1, [minimum_imm1_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %f1, [minimum_imm1_param_0]; ; CHECK-F16-NEXT: min.NaN.f32 %f2, %f1, 0f00000000; -; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-F16-NEXT: ret; ; ; CHECK-SM80-NOF16-LABEL: minimum_imm1( @@ -766,9 +766,9 @@ define float @minimum_imm1(float %a) { ; CHECK-SM80-NOF16-NEXT: .reg .b32 %f<3>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: -; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [minimum_imm1_param_0]; +; CHECK-SM80-NOF16-NEXT: ld.param.b32 %f1, [minimum_imm1_param_0]; ; CHECK-SM80-NOF16-NEXT: min.NaN.f32 %f2, %f1, 0f00000000; -; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-SM80-NOF16-NEXT: ret; %x = call float @llvm.minimum.f32(float %a, float 0.0) ret float %x @@ -782,7 +782,7 @@ define float @minimum_imm2(float %a) { ; CHECK-NOF16-NEXT: .reg .b32 %f<6>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.f32 %f1, [minimum_imm2_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %f1, [minimum_imm2_param_0]; ; CHECK-NOF16-NEXT: mov.b32 %r1, %f1; ; 
CHECK-NOF16-NEXT: setp.nan.f32 %p1, %f1, %f1; ; CHECK-NOF16-NEXT: min.f32 %f2, %f1, 0f00000000; @@ -791,7 +791,7 @@ define float @minimum_imm2(float %a) { ; CHECK-NOF16-NEXT: selp.f32 %f4, %f1, %f3, %p2; ; CHECK-NOF16-NEXT: setp.eq.f32 %p3, %f3, 0f00000000; ; CHECK-NOF16-NEXT: selp.f32 %f5, %f4, %f3, %p3; -; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f5; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %f5; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: minimum_imm2( @@ -799,9 +799,9 @@ define float @minimum_imm2(float %a) { ; CHECK-F16-NEXT: .reg .b32 %f<3>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.f32 %f1, [minimum_imm2_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %f1, [minimum_imm2_param_0]; ; CHECK-F16-NEXT: min.NaN.f32 %f2, %f1, 0f00000000; -; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-F16-NEXT: ret; ; ; CHECK-SM80-NOF16-LABEL: minimum_imm2( @@ -809,9 +809,9 @@ define float @minimum_imm2(float %a) { ; CHECK-SM80-NOF16-NEXT: .reg .b32 %f<3>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: -; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [minimum_imm2_param_0]; +; CHECK-SM80-NOF16-NEXT: ld.param.b32 %f1, [minimum_imm2_param_0]; ; CHECK-SM80-NOF16-NEXT: min.NaN.f32 %f2, %f1, 0f00000000; -; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-SM80-NOF16-NEXT: ret; %x = call float @llvm.minimum.f32(float 0.0, float %a) ret float %x @@ -825,9 +825,9 @@ define float @minimum_float_ftz(float %a, float %b) #1 { ; CHECK-NOF16-NEXT: .reg .b32 %f<8>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.f32 %f1, [minimum_float_ftz_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %f1, [minimum_float_ftz_param_0]; ; CHECK-NOF16-NEXT: mov.b32 %r1, %f1; -; CHECK-NOF16-NEXT: ld.param.f32 %f2, [minimum_float_ftz_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %f2, 
[minimum_float_ftz_param_1]; ; CHECK-NOF16-NEXT: setp.nan.ftz.f32 %p1, %f1, %f2; ; CHECK-NOF16-NEXT: min.ftz.f32 %f3, %f1, %f2; ; CHECK-NOF16-NEXT: selp.f32 %f4, 0f7FC00000, %f3, %p1; @@ -838,7 +838,7 @@ define float @minimum_float_ftz(float %a, float %b) #1 { ; CHECK-NOF16-NEXT: selp.f32 %f6, %f2, %f5, %p3; ; CHECK-NOF16-NEXT: setp.eq.ftz.f32 %p4, %f4, 0f00000000; ; CHECK-NOF16-NEXT: selp.f32 %f7, %f6, %f4, %p4; -; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f7; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %f7; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: minimum_float_ftz( @@ -846,10 +846,10 @@ define float @minimum_float_ftz(float %a, float %b) #1 { ; CHECK-F16-NEXT: .reg .b32 %f<4>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.f32 %f1, [minimum_float_ftz_param_0]; -; CHECK-F16-NEXT: ld.param.f32 %f2, [minimum_float_ftz_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %f1, [minimum_float_ftz_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %f2, [minimum_float_ftz_param_1]; ; CHECK-F16-NEXT: min.NaN.ftz.f32 %f3, %f1, %f2; -; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-F16-NEXT: ret; ; ; CHECK-SM80-NOF16-LABEL: minimum_float_ftz( @@ -857,10 +857,10 @@ define float @minimum_float_ftz(float %a, float %b) #1 { ; CHECK-SM80-NOF16-NEXT: .reg .b32 %f<4>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: -; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [minimum_float_ftz_param_0]; -; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f2, [minimum_float_ftz_param_1]; +; CHECK-SM80-NOF16-NEXT: ld.param.b32 %f1, [minimum_float_ftz_param_0]; +; CHECK-SM80-NOF16-NEXT: ld.param.b32 %f2, [minimum_float_ftz_param_1]; ; CHECK-SM80-NOF16-NEXT: min.NaN.ftz.f32 %f3, %f1, %f2; -; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-SM80-NOF16-NEXT: ret; %x = call float @llvm.minimum.f32(float %a, float %b) ret float %x @@ -874,9 
+874,9 @@ define double @minimum_double(double %a, double %b) { ; CHECK-NEXT: .reg .b64 %fd<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [minimum_double_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [minimum_double_param_0]; ; CHECK-NEXT: mov.b64 %rd1, %fd1; -; CHECK-NEXT: ld.param.f64 %fd2, [minimum_double_param_1]; +; CHECK-NEXT: ld.param.b64 %fd2, [minimum_double_param_1]; ; CHECK-NEXT: setp.nan.f64 %p1, %fd1, %fd2; ; CHECK-NEXT: min.f64 %fd3, %fd1, %fd2; ; CHECK-NEXT: selp.f64 %fd4, 0d7FF8000000000000, %fd3, %p1; @@ -887,7 +887,7 @@ define double @minimum_double(double %a, double %b) { ; CHECK-NEXT: selp.f64 %fd6, %fd2, %fd5, %p3; ; CHECK-NEXT: setp.eq.f64 %p4, %fd4, 0d0000000000000000; ; CHECK-NEXT: selp.f64 %fd7, %fd6, %fd4, %p4; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd7; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd7; ; CHECK-NEXT: ret; %x = call double @llvm.minimum.f64(double %a, double %b) ret double %x @@ -1045,9 +1045,9 @@ define float @maxnum_imm1(float %a) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [maxnum_imm1_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [maxnum_imm1_param_0]; ; CHECK-NEXT: max.f32 %f2, %f1, 0f00000000; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %x = call float @llvm.maxnum.f32(float %a, float 0.0) ret float %x @@ -1059,9 +1059,9 @@ define float @maxnum_imm2(float %a) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [maxnum_imm2_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [maxnum_imm2_param_0]; ; CHECK-NEXT: max.f32 %f2, %f1, 0f00000000; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %x = call float @llvm.maxnum.f32(float 0.0, float %a) ret float %x @@ -1073,10 +1073,10 @@ define float @maxnum_float(float %a, float %b) { ; CHECK-NEXT: .reg 
.b32 %f<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [maxnum_float_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [maxnum_float_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [maxnum_float_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [maxnum_float_param_1]; ; CHECK-NEXT: max.f32 %f3, %f1, %f2; -; CHECK-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-NEXT: ret; %x = call float @llvm.maxnum.f32(float %a, float %b) ret float %x @@ -1088,10 +1088,10 @@ define float @maxnum_float_ftz(float %a, float %b) #1 { ; CHECK-NEXT: .reg .b32 %f<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [maxnum_float_ftz_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [maxnum_float_ftz_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [maxnum_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [maxnum_float_ftz_param_1]; ; CHECK-NEXT: max.ftz.f32 %f3, %f1, %f2; -; CHECK-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-NEXT: ret; %x = call float @llvm.maxnum.f32(float %a, float %b) ret float %x @@ -1103,10 +1103,10 @@ define double @maxnum_double(double %a, double %b) { ; CHECK-NEXT: .reg .b64 %fd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [maxnum_double_param_0]; -; CHECK-NEXT: ld.param.f64 %fd2, [maxnum_double_param_1]; +; CHECK-NEXT: ld.param.b64 %fd1, [maxnum_double_param_0]; +; CHECK-NEXT: ld.param.b64 %fd2, [maxnum_double_param_1]; ; CHECK-NEXT: max.f64 %fd3, %fd1, %fd2; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd3; ; CHECK-NEXT: ret; %x = call double @llvm.maxnum.f64(double %a, double %b) ret double %x @@ -1247,13 +1247,13 @@ define float @maximum_imm1(float %a) { ; CHECK-NOF16-NEXT: .reg .b32 %f<5>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.f32 %f1, [maximum_imm1_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %f1, 
[maximum_imm1_param_0]; ; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %f1, %f1; ; CHECK-NOF16-NEXT: max.f32 %f2, %f1, 0f00000000; ; CHECK-NOF16-NEXT: selp.f32 %f3, 0f7FC00000, %f2, %p1; ; CHECK-NOF16-NEXT: setp.eq.f32 %p2, %f3, 0f00000000; ; CHECK-NOF16-NEXT: selp.f32 %f4, 0f00000000, %f3, %p2; -; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f4; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %f4; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: maximum_imm1( @@ -1261,9 +1261,9 @@ define float @maximum_imm1(float %a) { ; CHECK-F16-NEXT: .reg .b32 %f<3>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.f32 %f1, [maximum_imm1_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %f1, [maximum_imm1_param_0]; ; CHECK-F16-NEXT: max.NaN.f32 %f2, %f1, 0f00000000; -; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-F16-NEXT: ret; ; ; CHECK-SM80-NOF16-LABEL: maximum_imm1( @@ -1271,9 +1271,9 @@ define float @maximum_imm1(float %a) { ; CHECK-SM80-NOF16-NEXT: .reg .b32 %f<3>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: -; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [maximum_imm1_param_0]; +; CHECK-SM80-NOF16-NEXT: ld.param.b32 %f1, [maximum_imm1_param_0]; ; CHECK-SM80-NOF16-NEXT: max.NaN.f32 %f2, %f1, 0f00000000; -; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-SM80-NOF16-NEXT: ret; %x = call float @llvm.maximum.f32(float %a, float 0.0) ret float %x @@ -1286,13 +1286,13 @@ define float @maximum_imm2(float %a) { ; CHECK-NOF16-NEXT: .reg .b32 %f<5>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.f32 %f1, [maximum_imm2_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %f1, [maximum_imm2_param_0]; ; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %f1, %f1; ; CHECK-NOF16-NEXT: max.f32 %f2, %f1, 0f00000000; ; CHECK-NOF16-NEXT: selp.f32 %f3, 0f7FC00000, %f2, %p1; ; CHECK-NOF16-NEXT: setp.eq.f32 %p2, %f3, 
0f00000000; ; CHECK-NOF16-NEXT: selp.f32 %f4, 0f00000000, %f3, %p2; -; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f4; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %f4; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: maximum_imm2( @@ -1300,9 +1300,9 @@ define float @maximum_imm2(float %a) { ; CHECK-F16-NEXT: .reg .b32 %f<3>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.f32 %f1, [maximum_imm2_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %f1, [maximum_imm2_param_0]; ; CHECK-F16-NEXT: max.NaN.f32 %f2, %f1, 0f00000000; -; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-F16-NEXT: ret; ; ; CHECK-SM80-NOF16-LABEL: maximum_imm2( @@ -1310,9 +1310,9 @@ define float @maximum_imm2(float %a) { ; CHECK-SM80-NOF16-NEXT: .reg .b32 %f<3>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: -; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [maximum_imm2_param_0]; +; CHECK-SM80-NOF16-NEXT: ld.param.b32 %f1, [maximum_imm2_param_0]; ; CHECK-SM80-NOF16-NEXT: max.NaN.f32 %f2, %f1, 0f00000000; -; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-SM80-NOF16-NEXT: ret; %x = call float @llvm.maximum.f32(float 0.0, float %a) ret float %x @@ -1326,9 +1326,9 @@ define float @maximum_float(float %a, float %b) { ; CHECK-NOF16-NEXT: .reg .b32 %f<8>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.f32 %f1, [maximum_float_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %f1, [maximum_float_param_0]; ; CHECK-NOF16-NEXT: mov.b32 %r1, %f1; -; CHECK-NOF16-NEXT: ld.param.f32 %f2, [maximum_float_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %f2, [maximum_float_param_1]; ; CHECK-NOF16-NEXT: setp.nan.f32 %p1, %f1, %f2; ; CHECK-NOF16-NEXT: max.f32 %f3, %f1, %f2; ; CHECK-NOF16-NEXT: selp.f32 %f4, 0f7FC00000, %f3, %p1; @@ -1339,7 +1339,7 @@ define float @maximum_float(float %a, float %b) { ; CHECK-NOF16-NEXT: 
selp.f32 %f6, %f2, %f5, %p3; ; CHECK-NOF16-NEXT: setp.eq.f32 %p4, %f4, 0f00000000; ; CHECK-NOF16-NEXT: selp.f32 %f7, %f6, %f4, %p4; -; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f7; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %f7; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: maximum_float( @@ -1347,10 +1347,10 @@ define float @maximum_float(float %a, float %b) { ; CHECK-F16-NEXT: .reg .b32 %f<4>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.f32 %f1, [maximum_float_param_0]; -; CHECK-F16-NEXT: ld.param.f32 %f2, [maximum_float_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %f1, [maximum_float_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %f2, [maximum_float_param_1]; ; CHECK-F16-NEXT: max.NaN.f32 %f3, %f1, %f2; -; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-F16-NEXT: ret; ; ; CHECK-SM80-NOF16-LABEL: maximum_float( @@ -1358,10 +1358,10 @@ define float @maximum_float(float %a, float %b) { ; CHECK-SM80-NOF16-NEXT: .reg .b32 %f<4>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: -; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [maximum_float_param_0]; -; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f2, [maximum_float_param_1]; +; CHECK-SM80-NOF16-NEXT: ld.param.b32 %f1, [maximum_float_param_0]; +; CHECK-SM80-NOF16-NEXT: ld.param.b32 %f2, [maximum_float_param_1]; ; CHECK-SM80-NOF16-NEXT: max.NaN.f32 %f3, %f1, %f2; -; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-SM80-NOF16-NEXT: ret; %x = call float @llvm.maximum.f32(float %a, float %b) ret float %x @@ -1375,9 +1375,9 @@ define float @maximum_float_ftz(float %a, float %b) #1 { ; CHECK-NOF16-NEXT: .reg .b32 %f<8>; ; CHECK-NOF16-EMPTY: ; CHECK-NOF16-NEXT: // %bb.0: -; CHECK-NOF16-NEXT: ld.param.f32 %f1, [maximum_float_ftz_param_0]; +; CHECK-NOF16-NEXT: ld.param.b32 %f1, [maximum_float_ftz_param_0]; ; CHECK-NOF16-NEXT: mov.b32 %r1, %f1; -; 
CHECK-NOF16-NEXT: ld.param.f32 %f2, [maximum_float_ftz_param_1]; +; CHECK-NOF16-NEXT: ld.param.b32 %f2, [maximum_float_ftz_param_1]; ; CHECK-NOF16-NEXT: setp.nan.ftz.f32 %p1, %f1, %f2; ; CHECK-NOF16-NEXT: max.ftz.f32 %f3, %f1, %f2; ; CHECK-NOF16-NEXT: selp.f32 %f4, 0f7FC00000, %f3, %p1; @@ -1388,7 +1388,7 @@ define float @maximum_float_ftz(float %a, float %b) #1 { ; CHECK-NOF16-NEXT: selp.f32 %f6, %f2, %f5, %p3; ; CHECK-NOF16-NEXT: setp.eq.ftz.f32 %p4, %f4, 0f00000000; ; CHECK-NOF16-NEXT: selp.f32 %f7, %f6, %f4, %p4; -; CHECK-NOF16-NEXT: st.param.f32 [func_retval0], %f7; +; CHECK-NOF16-NEXT: st.param.b32 [func_retval0], %f7; ; CHECK-NOF16-NEXT: ret; ; ; CHECK-F16-LABEL: maximum_float_ftz( @@ -1396,10 +1396,10 @@ define float @maximum_float_ftz(float %a, float %b) #1 { ; CHECK-F16-NEXT: .reg .b32 %f<4>; ; CHECK-F16-EMPTY: ; CHECK-F16-NEXT: // %bb.0: -; CHECK-F16-NEXT: ld.param.f32 %f1, [maximum_float_ftz_param_0]; -; CHECK-F16-NEXT: ld.param.f32 %f2, [maximum_float_ftz_param_1]; +; CHECK-F16-NEXT: ld.param.b32 %f1, [maximum_float_ftz_param_0]; +; CHECK-F16-NEXT: ld.param.b32 %f2, [maximum_float_ftz_param_1]; ; CHECK-F16-NEXT: max.NaN.ftz.f32 %f3, %f1, %f2; -; CHECK-F16-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-F16-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-F16-NEXT: ret; ; ; CHECK-SM80-NOF16-LABEL: maximum_float_ftz( @@ -1407,10 +1407,10 @@ define float @maximum_float_ftz(float %a, float %b) #1 { ; CHECK-SM80-NOF16-NEXT: .reg .b32 %f<4>; ; CHECK-SM80-NOF16-EMPTY: ; CHECK-SM80-NOF16-NEXT: // %bb.0: -; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f1, [maximum_float_ftz_param_0]; -; CHECK-SM80-NOF16-NEXT: ld.param.f32 %f2, [maximum_float_ftz_param_1]; +; CHECK-SM80-NOF16-NEXT: ld.param.b32 %f1, [maximum_float_ftz_param_0]; +; CHECK-SM80-NOF16-NEXT: ld.param.b32 %f2, [maximum_float_ftz_param_1]; ; CHECK-SM80-NOF16-NEXT: max.NaN.ftz.f32 %f3, %f1, %f2; -; CHECK-SM80-NOF16-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-SM80-NOF16-NEXT: st.param.b32 [func_retval0], 
%f3; ; CHECK-SM80-NOF16-NEXT: ret; %x = call float @llvm.maximum.f32(float %a, float %b) ret float %x @@ -1424,9 +1424,9 @@ define double @maximum_double(double %a, double %b) { ; CHECK-NEXT: .reg .b64 %fd<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [maximum_double_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [maximum_double_param_0]; ; CHECK-NEXT: mov.b64 %rd1, %fd1; -; CHECK-NEXT: ld.param.f64 %fd2, [maximum_double_param_1]; +; CHECK-NEXT: ld.param.b64 %fd2, [maximum_double_param_1]; ; CHECK-NEXT: setp.nan.f64 %p1, %fd1, %fd2; ; CHECK-NEXT: max.f64 %fd3, %fd1, %fd2; ; CHECK-NEXT: selp.f64 %fd4, 0d7FF8000000000000, %fd3, %p1; @@ -1437,7 +1437,7 @@ define double @maximum_double(double %a, double %b) { ; CHECK-NEXT: selp.f64 %fd6, %fd2, %fd5, %p3; ; CHECK-NEXT: setp.eq.f64 %p4, %fd4, 0d0000000000000000; ; CHECK-NEXT: selp.f64 %fd7, %fd6, %fd4, %p4; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd7; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd7; ; CHECK-NEXT: ret; %x = call double @llvm.maximum.f64(double %a, double %b) ret double %x @@ -1550,11 +1550,11 @@ define float @fma_float(float %a, float %b, float %c) { ; CHECK-NEXT: .reg .b32 %f<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [fma_float_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [fma_float_param_1]; -; CHECK-NEXT: ld.param.f32 %f3, [fma_float_param_2]; +; CHECK-NEXT: ld.param.b32 %f1, [fma_float_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [fma_float_param_1]; +; CHECK-NEXT: ld.param.b32 %f3, [fma_float_param_2]; ; CHECK-NEXT: fma.rn.f32 %f4, %f1, %f2, %f3; -; CHECK-NEXT: st.param.f32 [func_retval0], %f4; +; CHECK-NEXT: st.param.b32 [func_retval0], %f4; ; CHECK-NEXT: ret; %x = call float @llvm.fma.f32(float %a, float %b, float %c) ret float %x @@ -1566,11 +1566,11 @@ define float @fma_float_ftz(float %a, float %b, float %c) #1 { ; CHECK-NEXT: .reg .b32 %f<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, 
[fma_float_ftz_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [fma_float_ftz_param_1]; -; CHECK-NEXT: ld.param.f32 %f3, [fma_float_ftz_param_2]; +; CHECK-NEXT: ld.param.b32 %f1, [fma_float_ftz_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [fma_float_ftz_param_1]; +; CHECK-NEXT: ld.param.b32 %f3, [fma_float_ftz_param_2]; ; CHECK-NEXT: fma.rn.ftz.f32 %f4, %f1, %f2, %f3; -; CHECK-NEXT: st.param.f32 [func_retval0], %f4; +; CHECK-NEXT: st.param.b32 [func_retval0], %f4; ; CHECK-NEXT: ret; %x = call float @llvm.fma.f32(float %a, float %b, float %c) ret float %x @@ -1582,11 +1582,11 @@ define double @fma_double(double %a, double %b, double %c) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [fma_double_param_0]; -; CHECK-NEXT: ld.param.f64 %fd2, [fma_double_param_1]; -; CHECK-NEXT: ld.param.f64 %fd3, [fma_double_param_2]; +; CHECK-NEXT: ld.param.b64 %fd1, [fma_double_param_0]; +; CHECK-NEXT: ld.param.b64 %fd2, [fma_double_param_1]; +; CHECK-NEXT: ld.param.b64 %fd3, [fma_double_param_2]; ; CHECK-NEXT: fma.rn.f64 %fd4, %fd1, %fd2, %fd3; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd4; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd4; ; CHECK-NEXT: ret; %x = call double @llvm.fma.f64(double %a, double %b, double %c) ret double %x diff --git a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll index b3abcc1a21d2..db8733da5b7e 100644 --- a/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll +++ b/llvm/test/CodeGen/NVPTX/misaligned-vector-ldst.ll @@ -8,8 +8,8 @@ target triple = "nvptx64-nvidia-cuda" define <4 x float> @t1(ptr %p1) { ; CHECK-NOT: ld.v4 ; CHECK-NOT: ld.v2 -; CHECK-NOT: ld.f32 -; CHECK: ld.u8 +; CHECK-NOT: ld.b32 +; CHECK: ld.b8 %r = load <4 x float>, ptr %p1, align 1 ret <4 x float> %r } @@ -18,7 +18,7 @@ define <4 x float> @t1(ptr %p1) { define <4 x float> @t2(ptr %p1) { ; CHECK-NOT: ld.v4 ; CHECK-NOT: ld.v2 -; CHECK: ld.f32 +; CHECK: ld.b32 %r = load <4 x 
float>, ptr %p1, align 4 ret <4 x float> %r } @@ -39,12 +39,12 @@ define <4 x float> @t4(ptr %p1) { } ; CHECK-LABEL: .visible .func test_v1halfp0a1( -; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_v1halfp0a1_param_0]; -; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_v1halfp0a1_param_1]; -; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] -; CHECK-DAG: st.u8 [%[[TO]]], [[B0]] -; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] -; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]] +; CHECK-DAG: ld.param.b64 %[[FROM:rd?[0-9]+]], [test_v1halfp0a1_param_0]; +; CHECK-DAG: ld.param.b64 %[[TO:rd?[0-9]+]], [test_v1halfp0a1_param_1]; +; CHECK-DAG: ld.b8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] +; CHECK-DAG: st.b8 [%[[TO]]], [[B0]] +; CHECK-DAG: ld.b8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] +; CHECK-DAG: st.b8 [%[[TO]]+1], [[B1]] ; CHECK: ret define void @test_v1halfp0a1(ptr noalias readonly %from, ptr %to) { %1 = load <1 x half>, ptr %from , align 1 @@ -53,16 +53,16 @@ define void @test_v1halfp0a1(ptr noalias readonly %from, ptr %to) { } ; CHECK-LABEL: .visible .func test_v2halfp0a1( -; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_v2halfp0a1_param_0]; -; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_v2halfp0a1_param_1]; -; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] -; CHECK-DAG: st.u8 [%[[TO]]], -; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] -; CHECK-DAG: st.u8 [%[[TO]]+1], -; CHECK-DAG: ld.u8 [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2] -; CHECK-DAG: st.u8 [%[[TO]]+2], -; CHECK-DAG: ld.u8 [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3] -; CHECK-DAG: st.u8 [%[[TO]]+3], +; CHECK-DAG: ld.param.b64 %[[FROM:rd?[0-9]+]], [test_v2halfp0a1_param_0]; +; CHECK-DAG: ld.param.b64 %[[TO:rd?[0-9]+]], [test_v2halfp0a1_param_1]; +; CHECK-DAG: ld.b8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] +; CHECK-DAG: st.b8 [%[[TO]]], +; CHECK-DAG: ld.b8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] +; CHECK-DAG: st.b8 [%[[TO]]+1], +; CHECK-DAG: ld.b8 [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2] +; CHECK-DAG: st.b8 [%[[TO]]+2], +; 
CHECK-DAG: ld.b8 [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3] +; CHECK-DAG: st.b8 [%[[TO]]+3], ; CHECK: ret define void @test_v2halfp0a1(ptr noalias readonly %from, ptr %to) { %1 = load <2 x half>, ptr %from , align 1 @@ -71,24 +71,24 @@ define void @test_v2halfp0a1(ptr noalias readonly %from, ptr %to) { } ; CHECK-LABEL: .visible .func test_v4halfp0a1( -; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_v4halfp0a1_param_0]; -; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_v4halfp0a1_param_1]; -; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] -; CHECK-DAG: st.u8 [%[[TO]]], [[B0]] -; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] -; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]] -; CHECK-DAG: ld.u8 [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2] -; CHECK-DAG: st.u8 [%[[TO]]+2], [[B2]] -; CHECK-DAG: ld.u8 [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3] -; CHECK-DAG: st.u8 [%[[TO]]+3], [[B3]] -; CHECK-DAG: ld.u8 [[B4:%r[sd]?[0-9]+]], [%[[FROM]]+4] -; CHECK-DAG: st.u8 [%[[TO]]+4], [[B4]] -; CHECK-DAG: ld.u8 [[B5:%r[sd]?[0-9]+]], [%[[FROM]]+5] -; CHECK-DAG: st.u8 [%[[TO]]+5], [[B5]] -; CHECK-DAG: ld.u8 [[B6:%r[sd]?[0-9]+]], [%[[FROM]]+6] -; CHECK-DAG: st.u8 [%[[TO]]+6], [[B6]] -; CHECK-DAG: ld.u8 [[B7:%r[sd]?[0-9]+]], [%[[FROM]]+7] -; CHECK-DAG: st.u8 [%[[TO]]+7], [[B7]] +; CHECK-DAG: ld.param.b64 %[[FROM:rd?[0-9]+]], [test_v4halfp0a1_param_0]; +; CHECK-DAG: ld.param.b64 %[[TO:rd?[0-9]+]], [test_v4halfp0a1_param_1]; +; CHECK-DAG: ld.b8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] +; CHECK-DAG: st.b8 [%[[TO]]], [[B0]] +; CHECK-DAG: ld.b8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] +; CHECK-DAG: st.b8 [%[[TO]]+1], [[B1]] +; CHECK-DAG: ld.b8 [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2] +; CHECK-DAG: st.b8 [%[[TO]]+2], [[B2]] +; CHECK-DAG: ld.b8 [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3] +; CHECK-DAG: st.b8 [%[[TO]]+3], [[B3]] +; CHECK-DAG: ld.b8 [[B4:%r[sd]?[0-9]+]], [%[[FROM]]+4] +; CHECK-DAG: st.b8 [%[[TO]]+4], [[B4]] +; CHECK-DAG: ld.b8 [[B5:%r[sd]?[0-9]+]], [%[[FROM]]+5] +; CHECK-DAG: st.b8 [%[[TO]]+5], [[B5]] +; CHECK-DAG: ld.b8 
[[B6:%r[sd]?[0-9]+]], [%[[FROM]]+6] +; CHECK-DAG: st.b8 [%[[TO]]+6], [[B6]] +; CHECK-DAG: ld.b8 [[B7:%r[sd]?[0-9]+]], [%[[FROM]]+7] +; CHECK-DAG: st.b8 [%[[TO]]+7], [[B7]] ; CHECK: ret define void @test_v4halfp0a1(ptr noalias readonly %from, ptr %to) { %1 = load <4 x half>, ptr %from , align 1 @@ -101,8 +101,8 @@ define void @test_v4halfp0a1(ptr noalias readonly %from, ptr %to) { define void @s1(ptr %p1, <4 x float> %v) { ; CHECK-NOT: st.v4 ; CHECK-NOT: st.v2 -; CHECK-NOT: st.f32 -; CHECK: st.u8 +; CHECK-NOT: st.b32 +; CHECK: st.b8 store <4 x float> %v, ptr %p1, align 1 ret void } @@ -111,7 +111,7 @@ define void @s1(ptr %p1, <4 x float> %v) { define void @s2(ptr %p1, <4 x float> %v) { ; CHECK-NOT: st.v4 ; CHECK-NOT: st.v2 -; CHECK: st.f32 +; CHECK: st.b32 store <4 x float> %v, ptr %p1, align 4 ret void } diff --git a/llvm/test/CodeGen/NVPTX/misched_func_call.ll b/llvm/test/CodeGen/NVPTX/misched_func_call.ll index fb4c653b709f..2e12c5041b06 100644 --- a/llvm/test/CodeGen/NVPTX/misched_func_call.ll +++ b/llvm/test/CodeGen/NVPTX/misched_func_call.ll @@ -12,22 +12,22 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; CHECK-NEXT: .reg .b64 %fd<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %bb -; CHECK-NEXT: ld.param.u32 %r4, [wombat_param_2]; -; CHECK-NEXT: ld.param.u32 %r3, [wombat_param_1]; -; CHECK-NEXT: ld.param.u32 %r2, [wombat_param_0]; +; CHECK-NEXT: ld.param.b32 %r4, [wombat_param_2]; +; CHECK-NEXT: ld.param.b32 %r3, [wombat_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [wombat_param_0]; ; CHECK-NEXT: mov.b32 %r10, 0; ; CHECK-NEXT: $L__BB0_1: // %bb3 ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .b64 param0; -; CHECK-NEXT: st.param.f64 [param0], 0d0000000000000000; +; CHECK-NEXT: st.param.b64 [param0], 0d0000000000000000; ; CHECK-NEXT: .param .b64 retval0; ; CHECK-NEXT: call.uni (retval0), ; CHECK-NEXT: quux, ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); -; CHECK-NEXT: ld.param.f64 
%fd1, [retval0]; +; CHECK-NEXT: ld.param.b64 %fd1, [retval0]; ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: mul.lo.s32 %r7, %r10, %r3; ; CHECK-NEXT: or.b32 %r8, %r4, %r7; @@ -36,7 +36,7 @@ define ptx_kernel void @wombat(i32 %arg, i32 %arg1, i32 %arg2) { ; CHECK-NEXT: cvt.rn.f64.u32 %fd4, %r10; ; CHECK-NEXT: add.rn.f64 %fd5, %fd4, %fd3; ; CHECK-NEXT: mov.b64 %rd1, 0; -; CHECK-NEXT: st.global.f64 [%rd1], %fd5; +; CHECK-NEXT: st.global.b64 [%rd1], %fd5; ; CHECK-NEXT: mov.b32 %r10, 1; ; CHECK-NEXT: bra.uni $L__BB0_1; bb: diff --git a/llvm/test/CodeGen/NVPTX/mulhi-intrins.ll b/llvm/test/CodeGen/NVPTX/mulhi-intrins.ll index 21fce55fcbc2..8a88e1b26c7f 100644 --- a/llvm/test/CodeGen/NVPTX/mulhi-intrins.ll +++ b/llvm/test/CodeGen/NVPTX/mulhi-intrins.ll @@ -9,8 +9,8 @@ define i16 @test_mulhi_i16(i16 %x, i16 %y) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [test_mulhi_i16_param_0]; -; CHECK-NEXT: ld.param.u16 %rs2, [test_mulhi_i16_param_1]; +; CHECK-NEXT: ld.param.b16 %rs1, [test_mulhi_i16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [test_mulhi_i16_param_1]; ; CHECK-NEXT: mul.hi.s16 %rs3, %rs1, %rs2; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; @@ -26,8 +26,8 @@ define i16 @test_mulhi_u16(i16 %x, i16 %y) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [test_mulhi_u16_param_0]; -; CHECK-NEXT: ld.param.u16 %rs2, [test_mulhi_u16_param_1]; +; CHECK-NEXT: ld.param.b16 %rs1, [test_mulhi_u16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [test_mulhi_u16_param_1]; ; CHECK-NEXT: mul.hi.u16 %rs3, %rs1, %rs2; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; @@ -42,8 +42,8 @@ define i32 @test_mulhi_i32(i32 %x, i32 %y) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_mulhi_i32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, 
[test_mulhi_i32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_mulhi_i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_mulhi_i32_param_1]; ; CHECK-NEXT: mul.hi.s32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -57,8 +57,8 @@ define i32 @test_mulhi_u32(i32 %x, i32 %y) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_mulhi_u32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_mulhi_u32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_mulhi_u32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_mulhi_u32_param_1]; ; CHECK-NEXT: mul.hi.u32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -72,8 +72,8 @@ define i64 @test_mulhi_i64(i64 %x, i64 %y) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_mulhi_i64_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [test_mulhi_i64_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_mulhi_i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_mulhi_i64_param_1]; ; CHECK-NEXT: mul.hi.s64 %rd3, %rd1, %rd2; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-NEXT: ret; @@ -87,8 +87,8 @@ define i64 @test_mulhi_u64(i64 %x, i64 %y) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_mulhi_u64_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [test_mulhi_u64_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_mulhi_u64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_mulhi_u64_param_1]; ; CHECK-NEXT: mul.hi.u64 %rd3, %rd1, %rd2; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd3; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/nounroll.ll b/llvm/test/CodeGen/NVPTX/nounroll.ll index f40c27ecd66b..e078570d4d43 100644 --- a/llvm/test/CodeGen/NVPTX/nounroll.ll +++ b/llvm/test/CodeGen/NVPTX/nounroll.ll @@ -20,15 +20,15 @@ for.body: %idxprom = sext i32 %i.06 to i64 %arrayidx = getelementptr 
inbounds float, ptr %input, i64 %idxprom %0 = load float, ptr %arrayidx, align 4 -; CHECK: ld.f32 +; CHECK: ld.b32 %arrayidx2 = getelementptr inbounds float, ptr %output, i64 %idxprom store float %0, ptr %arrayidx2, align 4 -; CHECK: st.f32 +; CHECK: st.b32 %inc = add nuw nsw i32 %i.06, 1 %exitcond = icmp eq i32 %inc, 2 br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0 -; CHECK-NOT: ld.f32 -; CHECK-NOT: st.f32 +; CHECK-NOT: ld.b32 +; CHECK-NOT: st.b32 for.end: ret void @@ -50,15 +50,15 @@ for.body: %idxprom = sext i32 %i.06 to i64 %arrayidx = getelementptr inbounds float, ptr %input, i64 %idxprom %0 = load float, ptr %arrayidx, align 4 -; CHECK: ld.f32 +; CHECK: ld.b32 %arrayidx2 = getelementptr inbounds float, ptr %output, i64 %idxprom store float %0, ptr %arrayidx2, align 4 -; CHECK: st.f32 +; CHECK: st.b32 %inc = add nuw nsw i32 %i.06, 1 %exitcond = icmp eq i32 %inc, 2 br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2 -; CHECK-NOT: ld.f32 -; CHECK-NOT: st.f32 +; CHECK-NOT: ld.b32 +; CHECK-NOT: st.b32 for.end: ret void diff --git a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll index 885c711d31f0..ff04e18701a8 100644 --- a/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll +++ b/llvm/test/CodeGen/NVPTX/nvvm-reflect-arch-O0.ll @@ -144,15 +144,15 @@ return: ; SM_52: .visible .func (.param .b32 func_retval0) phi() ; SM_52: mov.b32 %[[REG:.+]], 0f00000000; -; SM_52-NEXT: st.param.f32 [func_retval0], %[[REG]]; +; SM_52-NEXT: st.param.b32 [func_retval0], %[[REG]]; ; SM_52-NEXT: ret; ; SM_70: .visible .func (.param .b32 func_retval0) phi() ; SM_70: mov.b32 %[[REG:.+]], 0f00000000; -; SM_70-NEXT: st.param.f32 [func_retval0], %[[REG]]; +; SM_70-NEXT: st.param.b32 [func_retval0], %[[REG]]; ; SM_70-NEXT: ret; ; SM_90: .visible .func (.param .b32 func_retval0) phi() ; SM_90: mov.b32 %[[REG:.+]], 0f00000000; -; SM_90-NEXT: st.param.f32 [func_retval0], %[[REG]]; +; SM_90-NEXT: st.param.b32 
[func_retval0], %[[REG]]; ; SM_90-NEXT: ret; define float @phi() { entry: diff --git a/llvm/test/CodeGen/NVPTX/param-add.ll b/llvm/test/CodeGen/NVPTX/param-add.ll index c8daf3b5760f..1840de449415 100644 --- a/llvm/test/CodeGen/NVPTX/param-add.ll +++ b/llvm/test/CodeGen/NVPTX/param-add.ll @@ -18,13 +18,13 @@ define i32 @test(%struct.1float alignstack(32) %data) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %r1, [test_param_0+1]; +; CHECK-NEXT: ld.param.b8 %r1, [test_param_0+1]; ; CHECK-NEXT: shl.b32 %r2, %r1, 8; -; CHECK-NEXT: ld.param.u8 %r3, [test_param_0]; +; CHECK-NEXT: ld.param.b8 %r3, [test_param_0]; ; CHECK-NEXT: or.b32 %r4, %r2, %r3; -; CHECK-NEXT: ld.param.u8 %r5, [test_param_0+3]; +; CHECK-NEXT: ld.param.b8 %r5, [test_param_0+3]; ; CHECK-NEXT: shl.b32 %r6, %r5, 8; -; CHECK-NEXT: ld.param.u8 %r7, [test_param_0+2]; +; CHECK-NEXT: ld.param.b8 %r7, [test_param_0+2]; ; CHECK-NEXT: or.b32 %r8, %r6, %r7; ; CHECK-NEXT: shl.b32 %r9, %r8, 16; ; CHECK-NEXT: or.b32 %r17, %r9, %r4; diff --git a/llvm/test/CodeGen/NVPTX/param-align.ll b/llvm/test/CodeGen/NVPTX/param-align.ll index 2adc5496d833..16220fb4d47b 100644 --- a/llvm/test/CodeGen/NVPTX/param-align.ll +++ b/llvm/test/CodeGen/NVPTX/param-align.ll @@ -71,14 +71,14 @@ define ptx_device void @t6() { } ; CHECK-LABEL: .func check_ptr_align1( -; CHECK: ld.param.u64 %rd1, [check_ptr_align1_param_0]; -; CHECK-NOT: ld.param.u8 +; CHECK: ld.param.b64 %rd1, [check_ptr_align1_param_0]; +; CHECK-NOT: ld.param.b8 ; CHECK: mov.b32 %r1, 0; -; CHECK: st.u8 [%rd1+3], %r1; -; CHECK: st.u8 [%rd1+2], %r1; -; CHECK: st.u8 [%rd1+1], %r1; +; CHECK: st.b8 [%rd1+3], %r1; +; CHECK: st.b8 [%rd1+2], %r1; +; CHECK: st.b8 [%rd1+1], %r1; ; CHECK: mov.b32 %r2, 1; -; CHECK: st.u8 [%rd1], %r2; +; CHECK: st.b8 [%rd1], %r2; ; CHECK: ret; define void @check_ptr_align1(ptr align 1 %_arg_ptr) { entry: @@ -87,12 +87,12 @@ entry: } ; CHECK-LABEL: .func check_ptr_align2( -; CHECK: ld.param.u64 %rd1, 
[check_ptr_align2_param_0]; -; CHECK-NOT: ld.param.u16 +; CHECK: ld.param.b64 %rd1, [check_ptr_align2_param_0]; +; CHECK-NOT: ld.param.b16 ; CHECK: mov.b32 %r1, 0; -; CHECK: st.u16 [%rd1+2], %r1; +; CHECK: st.b16 [%rd1+2], %r1; ; CHECK: mov.b32 %r2, 2; -; CHECK: st.u16 [%rd1], %r2; +; CHECK: st.b16 [%rd1], %r2; ; CHECK: ret; define void @check_ptr_align2(ptr align 2 %_arg_ptr) { entry: @@ -101,10 +101,10 @@ entry: } ; CHECK-LABEL: .func check_ptr_align4( -; CHECK: ld.param.u64 %rd1, [check_ptr_align4_param_0]; -; CHECK-NOT: ld.param.u32 +; CHECK: ld.param.b64 %rd1, [check_ptr_align4_param_0]; +; CHECK-NOT: ld.param.b32 ; CHECK: mov.b32 %r1, 4; -; CHECK: st.u32 [%rd1], %r1; +; CHECK: st.b32 [%rd1], %r1; ; CHECK: ret; define void @check_ptr_align4(ptr align 4 %_arg_ptr) { entry: @@ -113,9 +113,9 @@ entry: } ; CHECK-LABEL: .func check_ptr_align8( -; CHECK: ld.param.u64 %rd1, [check_ptr_align8_param_0]; +; CHECK: ld.param.b64 %rd1, [check_ptr_align8_param_0]; ; CHECK: mov.b32 %r1, 8; -; CHECK: st.u32 [%rd1], %r1; +; CHECK: st.b32 [%rd1], %r1; ; CHECK: ret; define void @check_ptr_align8(ptr align 8 %_arg_ptr) { entry: diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll index 2523fab17d55..781156082e54 100644 --- a/llvm/test/CodeGen/NVPTX/param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll @@ -24,7 +24,7 @@ ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i1( ; CHECK-NEXT: .param .b32 test_i1_param_0 -; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i1_param_0]; +; CHECK: ld.param.b8 [[A8:%rs[0-9]+]], [test_i1_param_0]; ; CHECK: and.b16 [[A:%rs[0-9]+]], [[A8]], 1; ; CHECK: setp.ne.b16 %p1, [[A]], 0 ; CHECK: cvt.u32.u16 [[B:%r[0-9]+]], [[A8]] @@ -48,7 +48,7 @@ define i1 @test_i1(i1 %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i1s( ; CHECK-NEXT: .param .b32 test_i1s_param_0 -; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0]; +; CHECK: ld.param.b8 [[A8:%rs[0-9]+]], 
[test_i1s_param_0]; ; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; ; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1; ; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]]; @@ -70,8 +70,8 @@ define signext i1 @test_i1s(i1 signext %a) { ; CHECK: .func (.param .align 1 .b8 func_retval0[1]) ; CHECK-LABEL: test_v3i1( ; CHECK-NEXT: .param .align 1 .b8 test_v3i1_param_0[1] -; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2]; -; CHECK-DAG: ld.param.u8 [[E0:%rs[0-9]+]], [test_v3i1_param_0] +; CHECK-DAG: ld.param.b8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2]; +; CHECK-DAG: ld.param.b8 [[E0:%rs[0-9]+]], [test_v3i1_param_0] ; CHECK: .param .align 1 .b8 param0[1]; ; CHECK-DAG: st.param.b8 [param0], [[E0]]; ; CHECK-DAG: st.param.b8 [param0+2], [[E2]]; @@ -91,7 +91,7 @@ define <3 x i1> @test_v3i1(<3 x i1> %a) { ; CHECK: .func (.param .align 1 .b8 func_retval0[1]) ; CHECK-LABEL: test_v4i1( ; CHECK-NEXT: .param .align 1 .b8 test_v4i1_param_0[1] -; CHECK: ld.param.u8 [[E0:%rs[0-9]+]], [test_v4i1_param_0] +; CHECK: ld.param.b8 [[E0:%rs[0-9]+]], [test_v4i1_param_0] ; CHECK: .param .align 1 .b8 param0[1]; ; CHECK: st.param.b8 [param0], [[E0]]; ; CHECK: .param .align 1 .b8 retval0[1]; @@ -114,8 +114,8 @@ define <4 x i1> @test_v4i1(<4 x i1> %a) { ; CHECK: .func (.param .align 1 .b8 func_retval0[1]) ; CHECK-LABEL: test_v5i1( ; CHECK-NEXT: .param .align 1 .b8 test_v5i1_param_0[1] -; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4]; -; CHECK-DAG: ld.param.u8 [[E0:%rs[0-9]+]], [test_v5i1_param_0] +; CHECK-DAG: ld.param.b8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4]; +; CHECK-DAG: ld.param.b8 [[E0:%rs[0-9]+]], [test_v5i1_param_0] ; CHECK: .param .align 1 .b8 param0[1]; ; CHECK-DAG: st.param.b8 [param0], [[E0]]; ; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; @@ -135,7 +135,7 @@ define <5 x i1> @test_v5i1(<5 x i1> %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i2( ; CHECK-NEXT: .param .b32 test_i2_param_0 -; CHECK: ld.param.u8 {{%rs[0-9]+}}, [test_i2_param_0]; +; CHECK: 
ld.param.b8 {{%rs[0-9]+}}, [test_i2_param_0]; ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; @@ -152,7 +152,7 @@ define i2 @test_i2(i2 %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i3( ; CHECK-NEXT: .param .b32 test_i3_param_0 -; CHECK: ld.param.u8 {{%rs[0-9]+}}, [test_i3_param_0]; +; CHECK: ld.param.b8 {{%rs[0-9]+}}, [test_i3_param_0]; ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; @@ -170,7 +170,7 @@ define i3 @test_i3(i3 %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i8( ; CHECK-NEXT: .param .b32 test_i8_param_0 -; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0]; +; CHECK: ld.param.b8 [[A8:%rs[0-9]+]], [test_i8_param_0]; ; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; ; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255; ; CHECK: .param .b32 param0; @@ -212,7 +212,7 @@ define signext i8 @test_i8s(i8 signext %a) { ; CHECK: .func (.param .align 4 .b8 func_retval0[4]) ; CHECK-LABEL: test_v3i8( ; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4] -; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v3i8_param_0]; +; CHECK: ld.param.b32 [[R:%r[0-9]+]], [test_v3i8_param_0]; ; CHECK: .param .align 4 .b8 param0[4]; ; CHECK: st.param.b32 [param0], [[R]] ; CHECK: .param .align 4 .b8 retval0[4]; @@ -231,7 +231,7 @@ define <3 x i8> @test_v3i8(<3 x i8> %a) { ; CHECK: .func (.param .align 4 .b8 func_retval0[4]) ; CHECK-LABEL: test_v4i8( ; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4] -; CHECK: ld.param.u32 [[R:%r[0-9]+]], [test_v4i8_param_0] +; CHECK: ld.param.b32 [[R:%r[0-9]+]], [test_v4i8_param_0] ; CHECK: .param .align 4 .b8 param0[4]; ; CHECK: st.param.b32 [param0], [[R]]; ; CHECK: .param .align 4 .b8 retval0[4]; @@ -248,8 +248,8 @@ define <4 x i8> @test_v4i8(<4 x i8> %a) { ; CHECK: .func (.param .align 8 .b8 func_retval0[8]) ; CHECK-LABEL: test_v5i8( ; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8] -; 
CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_v5i8_param_0] -; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4]; +; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_v5i8_param_0] +; CHECK-DAG: ld.param.b8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4]; ; CHECK: .param .align 8 .b8 param0[8]; ; CHECK-DAG: st.param.v4.b8 [param0], ; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; @@ -269,7 +269,7 @@ define <5 x i8> @test_v5i8(<5 x i8> %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i11( ; CHECK-NEXT: .param .b32 test_i11_param_0 -; CHECK: ld.param.u16 {{%rs[0-9]+}}, [test_i11_param_0]; +; CHECK: ld.param.b16 {{%rs[0-9]+}}, [test_i11_param_0]; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; ; CHECK: call.uni (retval0), @@ -285,7 +285,7 @@ define i11 @test_i11(i11 %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i16( ; CHECK-NEXT: .param .b32 test_i16_param_0 -; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16_param_0]; +; CHECK: ld.param.b16 [[E16:%rs[0-9]+]], [test_i16_param_0]; ; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]]; ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], [[E32]]; @@ -304,7 +304,7 @@ define i16 @test_i16(i16 %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i16s( ; CHECK-NEXT: .param .b32 test_i16s_param_0 -; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16s_param_0]; +; CHECK: ld.param.b16 [[E16:%rs[0-9]+]], [test_i16s_param_0]; ; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]]; ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], [[E32]]; @@ -323,8 +323,8 @@ define signext i16 @test_i16s(i16 signext %a) { ; CHECK: .func (.param .align 8 .b8 func_retval0[8]) ; CHECK-LABEL: test_v3i16( ; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8] -; CHECK-DAG: ld.param.u16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4]; -; CHECK-DAG: ld.param.u32 [[R:%r[0-9]+]], [test_v3i16_param_0]; +; CHECK-DAG: ld.param.b16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4]; +; 
CHECK-DAG: ld.param.b32 [[R:%r[0-9]+]], [test_v3i16_param_0]; ; CHECK-DAG: mov.b32 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [[R]]; ; CHECK: .param .align 8 .b8 param0[8]; ; CHECK: st.param.v2.b16 [param0], {[[E0]], [[E1]]}; @@ -345,7 +345,7 @@ define <3 x i16> @test_v3i16(<3 x i16> %a) { ; CHECK: .func (.param .align 8 .b8 func_retval0[8]) ; CHECK-LABEL: test_v4i16( ; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8] -; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v4i16_param_0] +; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v4i16_param_0] ; CHECK: .param .align 8 .b8 param0[8]; ; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; ; CHECK: .param .align 8 .b8 retval0[8]; @@ -362,8 +362,8 @@ define <4 x i16> @test_v4i16(<4 x i16> %a) { ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) ; CHECK-LABEL: test_v5i16( ; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16] -; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8]; -; CHECK-DAG: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0] +; CHECK-DAG: ld.param.b16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8]; +; CHECK-DAG: ld.param.v4.b16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0] ; CHECK: .param .align 16 .b8 param0[16]; ; CHECK-DAG: st.param.v4.b16 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; ; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; @@ -474,7 +474,7 @@ define <3 x half> @test_v3f16(<3 x half> %a) { ; CHECK:.func (.param .align 8 .b8 func_retval0[8]) ; CHECK-LABEL: test_v4f16( ; CHECK: .param .align 8 .b8 test_v4f16_param_0[8] -; CHECK: ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0]; +; CHECK: ld.param.v2.b32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0]; ; CHECK: .param .align 8 .b8 param0[8]; ; CHECK: st.param.v2.b32 [param0], {[[R01]], [[R23]]}; ; CHECK: .param .align 8 .b8 retval0[8]; @@ -512,7 +512,7 
@@ define <5 x half> @test_v5f16(<5 x half> %a) { ; CHECK:.func (.param .align 16 .b8 func_retval0[16]) ; CHECK-LABEL: test_v8f16( ; CHECK: .param .align 16 .b8 test_v8f16_param_0[16] -; CHECK: ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0]; +; CHECK: ld.param.v4.b32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0]; ; CHECK: .param .align 16 .b8 param0[16]; ; CHECK: st.param.v4.b32 [param0], {[[R01]], [[R23]], [[R45]], [[R67]]}; ; CHECK: .param .align 16 .b8 retval0[16]; @@ -554,8 +554,8 @@ define <9 x half> @test_v9f16(<9 x half> %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i19( ; CHECK-NEXT: .param .b32 test_i19_param_0 -; CHECK-DAG: ld.param.u16 {{%r[0-9]+}}, [test_i19_param_0]; -; CHECK-DAG: ld.param.u8 {{%r[0-9]+}}, [test_i19_param_0+2]; +; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i19_param_0]; +; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i19_param_0+2]; ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; @@ -572,8 +572,8 @@ define i19 @test_i19(i19 %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i23( ; CHECK-NEXT: .param .b32 test_i23_param_0 -; CHECK-DAG: ld.param.u16 {{%r[0-9]+}}, [test_i23_param_0]; -; CHECK-DAG: ld.param.u8 {{%r[0-9]+}}, [test_i23_param_0+2]; +; CHECK-DAG: ld.param.b16 {{%r[0-9]+}}, [test_i23_param_0]; +; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i23_param_0+2]; ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; @@ -590,8 +590,8 @@ define i23 @test_i23(i23 %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i24( ; CHECK-NEXT: .param .b32 test_i24_param_0 -; CHECK-DAG: ld.param.u8 {{%r[0-9]+}}, [test_i24_param_0+2]; -; CHECK-DAG: ld.param.u16 {{%r[0-9]+}}, [test_i24_param_0]; +; CHECK-DAG: ld.param.b8 {{%r[0-9]+}}, [test_i24_param_0+2]; +; CHECK-DAG: ld.param.b16 
{{%r[0-9]+}}, [test_i24_param_0]; ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; @@ -608,7 +608,7 @@ define i24 @test_i24(i24 %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i29( ; CHECK-NEXT: .param .b32 test_i29_param_0 -; CHECK: ld.param.u32 {{%r[0-9]+}}, [test_i29_param_0]; +; CHECK: ld.param.b32 {{%r[0-9]+}}, [test_i29_param_0]; ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], {{%r[0-9]+}}; ; CHECK: .param .b32 retval0; @@ -625,7 +625,7 @@ define i29 @test_i29(i29 %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_i32( ; CHECK-NEXT: .param .b32 test_i32_param_0 -; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_i32_param_0]; +; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_i32_param_0]; ; CHECK: .param .b32 param0; ; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .b32 retval0; @@ -642,8 +642,8 @@ define i32 @test_i32(i32 %a) { ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) ; CHECK-LABEL: test_v3i32( ; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16] -; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8]; -; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0]; +; CHECK-DAG: ld.param.b32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8]; +; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0]; ; CHECK: .param .align 16 .b8 param0[16]; ; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.b32 [param0+8], [[E2]]; @@ -663,7 +663,7 @@ define <3 x i32> @test_v3i32(<3 x i32> %a) { ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) ; CHECK-LABEL: test_v4i32( ; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16] -; CHECK: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0] +; CHECK: ld.param.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0] ; CHECK: .param .align 
16 .b8 param0[16]; ; CHECK: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; ; CHECK: .param .align 16 .b8 retval0[16]; @@ -680,8 +680,8 @@ define <4 x i32> @test_v4i32(<4 x i32> %a) { ; CHECK: .func (.param .align 32 .b8 func_retval0[32]) ; CHECK-LABEL: test_v5i32( ; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32] -; CHECK-DAG: ld.param.u32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16]; -; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0] +; CHECK-DAG: ld.param.b32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16]; +; CHECK-DAG: ld.param.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0] ; CHECK: .param .align 32 .b8 param0[32]; ; CHECK-DAG: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; ; CHECK-DAG: st.param.b32 [param0+16], [[E4]]; @@ -701,14 +701,14 @@ define <5 x i32> @test_v5i32(<5 x i32> %a) { ; CHECK: .func (.param .b32 func_retval0) ; CHECK-LABEL: test_f32( ; CHECK-NEXT: .param .b32 test_f32_param_0 -; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_f32_param_0]; +; CHECK: ld.param.b32 [[E:%f[0-9]+]], [test_f32_param_0]; ; CHECK: .param .b32 param0; -; CHECK: st.param.f32 [param0], [[E]]; +; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .b32 retval0; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_f32, -; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0]; -; CHECK: st.param.f32 [func_retval0], [[R]]; +; CHECK: ld.param.b32 [[R:%f[0-9]+]], [retval0]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define float @test_f32(float %a) { %r = tail call float @test_f32(float %a); @@ -718,8 +718,8 @@ define float @test_f32(float %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i40( ; CHECK-NEXT: .param .b64 test_i40_param_0 -; CHECK-DAG: ld.param.u8 {{%rd[0-9]+}}, [test_i40_param_0+4]; -; CHECK-DAG: ld.param.u32 {{%rd[0-9]+}}, [test_i40_param_0]; +; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, 
[test_i40_param_0+4]; +; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i40_param_0]; ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; @@ -736,8 +736,8 @@ define i40 @test_i40(i40 %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i47( ; CHECK-NEXT: .param .b64 test_i47_param_0 -; CHECK-DAG: ld.param.u16 {{%rd[0-9]+}}, [test_i47_param_0+4]; -; CHECK-DAG: ld.param.u32 {{%rd[0-9]+}}, [test_i47_param_0]; +; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i47_param_0+4]; +; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i47_param_0]; ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; @@ -754,8 +754,8 @@ define i47 @test_i47(i47 %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i48( ; CHECK-NEXT: .param .b64 test_i48_param_0 -; CHECK-DAG: ld.param.u16 {{%rd[0-9]+}}, [test_i48_param_0+4]; -; CHECK-DAG: ld.param.u32 {{%rd[0-9]+}}, [test_i48_param_0]; +; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i48_param_0+4]; +; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i48_param_0]; ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; @@ -772,9 +772,9 @@ define i48 @test_i48(i48 %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i51( ; CHECK-NEXT: .param .b64 test_i51_param_0 -; CHECK-DAG: ld.param.u8 {{%rd[0-9]+}}, [test_i51_param_0+6]; -; CHECK-DAG: ld.param.u16 {{%rd[0-9]+}}, [test_i51_param_0+4]; -; CHECK-DAG: ld.param.u32 {{%rd[0-9]+}}, [test_i51_param_0]; +; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i51_param_0+6]; +; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i51_param_0+4]; +; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i51_param_0]; ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; @@ -791,9 +791,9 @@ define i51 @test_i51(i51 %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i56( ; 
CHECK-NEXT: .param .b64 test_i56_param_0 -; CHECK-DAG: ld.param.u8 {{%rd[0-9]+}}, [test_i56_param_0+6]; -; CHECK-DAG: ld.param.u16 {{%rd[0-9]+}}, [test_i56_param_0+4]; -; CHECK-DAG: ld.param.u32 {{%rd[0-9]+}}, [test_i56_param_0]; +; CHECK-DAG: ld.param.b8 {{%rd[0-9]+}}, [test_i56_param_0+6]; +; CHECK-DAG: ld.param.b16 {{%rd[0-9]+}}, [test_i56_param_0+4]; +; CHECK-DAG: ld.param.b32 {{%rd[0-9]+}}, [test_i56_param_0]; ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; @@ -810,7 +810,7 @@ define i56 @test_i56(i56 %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i57( ; CHECK-NEXT: .param .b64 test_i57_param_0 -; CHECK: ld.param.u64 {{%rd[0-9]+}}, [test_i57_param_0]; +; CHECK: ld.param.b64 {{%rd[0-9]+}}, [test_i57_param_0]; ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0], {{%rd[0-9]+}}; ; CHECK: .param .b64 retval0; @@ -827,7 +827,7 @@ define i57 @test_i57(i57 %a) { ; CHECK: .func (.param .b64 func_retval0) ; CHECK-LABEL: test_i64( ; CHECK-NEXT: .param .b64 test_i64_param_0 -; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_i64_param_0]; +; CHECK: ld.param.b64 [[E:%rd[0-9]+]], [test_i64_param_0]; ; CHECK: .param .b64 param0; ; CHECK: st.param.b64 [param0], [[E]]; ; CHECK: .param .b64 retval0; @@ -844,8 +844,8 @@ define i64 @test_i64(i64 %a) { ; CHECK: .func (.param .align 32 .b8 func_retval0[32]) ; CHECK-LABEL: test_v3i64( ; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32] -; CHECK-DAG: ld.param.u64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16]; -; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0]; +; CHECK-DAG: ld.param.b64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16]; +; CHECK-DAG: ld.param.v2.b64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0]; ; CHECK: .param .align 32 .b8 param0[32]; ; CHECK: st.param.v2.b64 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.b64 [param0+16], [[E2]]; @@ -868,8 +868,8 @@ define <3 x i64> @test_v3i64(<3 x i64> %a) 
{ ; CHECK: .func (.param .align 32 .b8 func_retval0[32]) ; CHECK-LABEL: test_v4i64( ; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32] -; CHECK-DAG: ld.param.v2.u64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16]; -; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0]; +; CHECK-DAG: ld.param.v2.b64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16]; +; CHECK-DAG: ld.param.v2.b64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0]; ; CHECK: .param .align 32 .b8 param0[32]; ; CHECK: st.param.v2.b64 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]}; @@ -891,7 +891,7 @@ define <4 x i64> @test_v4i64(<4 x i64> %a) { ; CHECK: .func (.param .align 1 .b8 func_retval0[1]) ; CHECK-LABEL: test_s_i1( ; CHECK-NEXT: .align 1 .b8 test_s_i1_param_0[1] -; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0]; +; CHECK: ld.param.b8 [[A:%rs[0-9]+]], [test_s_i1_param_0]; ; CHECK: .param .align 1 .b8 param0[1]; ; CHECK: st.param.b8 [param0], [[A]] ; CHECK: .param .align 1 .b8 retval0[1]; @@ -908,7 +908,7 @@ define %s_i1 @test_s_i1(%s_i1 %a) { ; CHECK: .func (.param .align 1 .b8 func_retval0[1]) ; CHECK-LABEL: test_s_i8( ; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1] -; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0]; +; CHECK: ld.param.b8 [[A:%rs[0-9]+]], [test_s_i8_param_0]; ; CHECK: .param .align 1 .b8 param0[1]; ; CHECK: st.param.b8 [param0], [[A]] ; CHECK: .param .align 1 .b8 retval0[1]; @@ -925,7 +925,7 @@ define %s_i8 @test_s_i8(%s_i8 %a) { ; CHECK: .func (.param .align 2 .b8 func_retval0[2]) ; CHECK-LABEL: test_s_i16( ; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2] -; CHECK: ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0]; +; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_s_i16_param_0]; ; CHECK: .param .align 2 .b8 param0[2]; ; CHECK: st.param.b16 [param0], [[A]] ; CHECK: .param .align 2 .b8 retval0[2]; @@ -959,7 +959,7 @@ define %s_f16 
@test_s_f16(%s_f16 %a) { ; CHECK: .func (.param .align 4 .b8 func_retval0[4]) ; CHECK-LABEL: test_s_i32( ; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4] -; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_s_i32_param_0]; +; CHECK: ld.param.b32 [[E:%r[0-9]+]], [test_s_i32_param_0]; ; CHECK: .param .align 4 .b8 param0[4] ; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .align 4 .b8 retval0[4]; @@ -976,14 +976,14 @@ define %s_i32 @test_s_i32(%s_i32 %a) { ; CHECK: .func (.param .align 4 .b8 func_retval0[4]) ; CHECK-LABEL: test_s_f32( ; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4] -; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_f32_param_0]; +; CHECK: ld.param.b32 [[E:%f[0-9]+]], [test_s_f32_param_0]; ; CHECK: .param .align 4 .b8 param0[4] -; CHECK: st.param.f32 [param0], [[E]]; +; CHECK: st.param.b32 [param0], [[E]]; ; CHECK: .param .align 4 .b8 retval0[4]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_s_f32, -; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0]; -; CHECK: st.param.f32 [func_retval0], [[R]]; +; CHECK: ld.param.b32 [[R:%f[0-9]+]], [retval0]; +; CHECK: st.param.b32 [func_retval0], [[R]]; ; CHECK-NEXT: ret; define %s_f32 @test_s_f32(%s_f32 %a) { %r = tail call %s_f32 @test_s_f32(%s_f32 %a); @@ -993,7 +993,7 @@ define %s_f32 @test_s_f32(%s_f32 %a) { ; CHECK: .func (.param .align 8 .b8 func_retval0[8]) ; CHECK-LABEL: test_s_i64( ; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8] -; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_s_i64_param_0]; +; CHECK: ld.param.b64 [[E:%rd[0-9]+]], [test_s_i64_param_0]; ; CHECK: .param .align 8 .b8 param0[8]; ; CHECK: st.param.b64 [param0], [[E]]; ; CHECK: .param .align 8 .b8 retval0[8]; @@ -1011,29 +1011,29 @@ define %s_i64 @test_s_i64(%s_i64 %a) { ; CHECK: .func (.param .align 8 .b8 func_retval0[24]) ; CHECK-LABEL: test_s_i32f32( ; CHECK: .param .align 8 .b8 test_s_i32f32_param_0[24] -; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16]; -; CHECK-DAG: ld.param.f32 [[E3:%f[0-9]+]], 
[test_s_i32f32_param_0+12]; -; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8]; -; CHECK-DAG: ld.param.f32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4]; -; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0]; +; CHECK-DAG: ld.param.b64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16]; +; CHECK-DAG: ld.param.b32 [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12]; +; CHECK-DAG: ld.param.b32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8]; +; CHECK-DAG: ld.param.b32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4]; +; CHECK-DAG: ld.param.b32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0]; ; CHECK: .param .align 8 .b8 param0[24]; ; CHECK-DAG: st.param.b32 [param0], [[E0]]; -; CHECK-DAG: st.param.f32 [param0+4], [[E1]]; +; CHECK-DAG: st.param.b32 [param0+4], [[E1]]; ; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; -; CHECK-DAG: st.param.f32 [param0+12], [[E3]]; +; CHECK-DAG: st.param.b32 [param0+12], [[E3]]; ; CHECK-DAG: st.param.b64 [param0+16], [[E4]]; ; CHECK: .param .align 8 .b8 retval0[24]; ; CHECK: call.uni (retval0), ; CHECK-NEXT: test_s_i32f32, ; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0]; -; CHECK-DAG: ld.param.f32 [[RE1:%f[0-9]+]], [retval0+4]; +; CHECK-DAG: ld.param.b32 [[RE1:%f[0-9]+]], [retval0+4]; ; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; -; CHECK-DAG: ld.param.f32 [[RE3:%f[0-9]+]], [retval0+12]; +; CHECK-DAG: ld.param.b32 [[RE3:%f[0-9]+]], [retval0+12]; ; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16]; ; CHECK-DAG: st.param.b32 [func_retval0], [[RE0]]; -; CHECK-DAG: st.param.f32 [func_retval0+4], [[RE1]]; +; CHECK-DAG: st.param.b32 [func_retval0+4], [[RE1]]; ; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]]; -; CHECK-DAG: st.param.f32 [func_retval0+12], [[RE3]]; +; CHECK-DAG: st.param.b32 [func_retval0+12], [[RE3]]; ; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]]; ; CHECK: ret; define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) { @@ -1045,9 +1045,9 @@ define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) { ; CHECK:.visible .func 
(.param .align 8 .b8 func_retval0[24]) ; CHECK-LABEL: test_s_i32x4( ; CHECK: .param .align 8 .b8 test_s_i32x4_param_0[24] -; CHECK-DAG: ld.param.u64 [[RD1:%rd[0-9]+]], [test_s_i32x4_param_0+16]; -; CHECK-DAG: ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8]; -; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0]; +; CHECK-DAG: ld.param.b64 [[RD1:%rd[0-9]+]], [test_s_i32x4_param_0+16]; +; CHECK-DAG: ld.param.v2.b32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8]; +; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0]; ; CHECK: .param .align 8 .b8 param0[24]; ; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]}; @@ -1071,11 +1071,11 @@ define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) { ; CHECK:.visible .func (.param .align 8 .b8 func_retval0[32]) ; CHECK-LABEL: test_s_i1i32x4( ; CHECK: .param .align 8 .b8 test_s_i1i32x4_param_0[32] -; CHECK: ld.param.u64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24]; -; CHECK: ld.param.u32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16]; -; CHECK: ld.param.u32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12]; -; CHECK: ld.param.u8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8]; -; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0]; +; CHECK: ld.param.b64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24]; +; CHECK: ld.param.b32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16]; +; CHECK: ld.param.b32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12]; +; CHECK: ld.param.b8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8]; +; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0]; ; CHECK: .param .align 8 .b8 param0[32]; ; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.b8 [param0+8], [[E2]]; @@ -1110,31 +1110,31 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { ; CHECK:.visible .func (.param .align 1 .b8 func_retval0[25]) 
; CHECK-LABEL: test_s_i1i32x4p( ; CHECK-DAG: .param .align 1 .b8 test_s_i1i32x4p_param_0[25] -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+24]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+23]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+22]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+21]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+20]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+19]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+18]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+17]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+16]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+15]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+14]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+13]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+12]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+11]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+10]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+9]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+8]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+7]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+6]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+5]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+4]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+3]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+24]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+23]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+22]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+21]; +; 
CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+20]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+19]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+18]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+17]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+16]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+15]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+14]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+13]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+12]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+11]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+10]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+9]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+8]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+7]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+6]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+5]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+4]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+3]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+2]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0+1]; +; CHECK-DAG: ld.param.b8 %r{{.*}}, [test_s_i1i32x4p_param_0]; ; CHECK: .param .align 1 .b8 param0[25]; ; CHECK-DAG: st.param.b8 [param0], ; CHECK-DAG: st.param.b8 [param0+1], @@ -1225,12 +1225,12 @@ define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { ; CHECK:.visible .func (.param .align 16 .b8 func_retval0[80]) ; CHECK-LABEL: test_s_crossfield( ; CHECK: .param .align 16 .b8 test_s_crossfield_param_0[80] -; CHECK: ld.param.u32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64]; -; CHECK: ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48]; -; CHECK: ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], 
[[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32]; -; CHECK: ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16]; -; CHECK: ld.param.u32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8]; -; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0]; +; CHECK: ld.param.b32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64]; +; CHECK: ld.param.v4.b32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48]; +; CHECK: ld.param.v4.b32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32]; +; CHECK: ld.param.v4.b32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16]; +; CHECK: ld.param.b32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8]; +; CHECK: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0]; ; CHECK: .param .align 16 .b8 param0[80]; ; CHECK: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; ; CHECK: st.param.b32 [param0+8], [[E2]]; diff --git a/llvm/test/CodeGen/NVPTX/param-overalign.ll b/llvm/test/CodeGen/NVPTX/param-overalign.ll index 374475a29ffa..e3d611865f1f 100644 --- a/llvm/test/CodeGen/NVPTX/param-overalign.ll +++ b/llvm/test/CodeGen/NVPTX/param-overalign.ll @@ -24,20 +24,20 @@ define float @caller_md(float %a, float %b) { ; CHECK-NEXT: ) ; CHECK-NEXT: { -; CHECK: ld.param.f32 %f1, [caller_md_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [caller_md_param_1]; +; CHECK: ld.param.b32 %f1, [caller_md_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [caller_md_param_1]; ; CHECK-NEXT: { ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v2.f32 [param0], {%f1, %f2}; +; CHECK-NEXT: st.param.v2.b32 [param0], {%f1, %f2}; ; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: call.uni (retval0), ; CHECK-NEXT: callee_md, ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); -; CHECK-NEXT: 
ld.param.f32 %f3, [retval0]; +; CHECK-NEXT: ld.param.b32 %f3, [retval0]; ; CHECK-NEXT: } -; CHECK-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-NEXT: ret; %s1 = insertvalue %struct.float2 poison, float %a, 0 %s2 = insertvalue %struct.float2 %s1, float %b, 1 @@ -51,9 +51,9 @@ define float @callee_md(%struct.float2 alignstack(8) %a) { ; CHECK-NEXT: ) ; CHECK-NEXT: { -; CHECK: ld.param.v2.f32 {%f1, %f2}, [callee_md_param_0]; +; CHECK: ld.param.v2.b32 {%f1, %f2}, [callee_md_param_0]; ; CHECK-NEXT: add.rn.f32 %f3, %f1, %f2; -; CHECK-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-NEXT: ret; %v0 = extractvalue %struct.float2 %a, 0 %v1 = extractvalue %struct.float2 %a, 1 @@ -68,20 +68,20 @@ define float @caller(float %a, float %b) { ; CHECK-NEXT: ) ; CHECK-NEXT: { -; CHECK: ld.param.f32 %f1, [caller_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [caller_param_1]; +; CHECK: ld.param.b32 %f1, [caller_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [caller_param_1]; ; CHECK-NEXT: { ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v2.f32 [param0], {%f1, %f2}; +; CHECK-NEXT: st.param.v2.b32 [param0], {%f1, %f2}; ; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: call.uni (retval0), ; CHECK-NEXT: callee, ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); -; CHECK-NEXT: ld.param.f32 %f3, [retval0]; +; CHECK-NEXT: ld.param.b32 %f3, [retval0]; ; CHECK-NEXT: } -; CHECK-NEXT: st.param.f32 [func_retval0], %f3; +; CHECK-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-NEXT: ret; %s1 = insertvalue %struct.float2 poison, float %a, 0 %s2 = insertvalue %struct.float2 %s1, float %b, 1 @@ -95,9 +95,9 @@ define float @callee(%struct.float2 alignstack(8) %a ) { ; CHECK-NEXT: ) ; CHECK-NEXT: { -; CHECK: ld.param.v2.f32 {%f1, %f2}, [callee_param_0]; +; CHECK: ld.param.v2.b32 {%f1, %f2}, [callee_param_0]; ; CHECK-NEXT: add.rn.f32 %f3, %f1, %f2; -; CHECK-NEXT: st.param.f32 
[func_retval0], %f3; +; CHECK-NEXT: st.param.b32 [func_retval0], %f3; ; CHECK-NEXT: ret; %v0 = extractvalue %struct.float2 %a, 0 %v1 = extractvalue %struct.float2 %a, 1 diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll index db8b1a6f53d1..abb1aff86775 100644 --- a/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll +++ b/llvm/test/CodeGen/NVPTX/param-vectorize-device.ll @@ -103,7 +103,7 @@ define internal fastcc [1 x i32] @callee_St4x1(i32 %in.0.val) { ; CHECK: .func (.param .align 16 .b8 func_retval0[4]) ; CHECK-LABEL: callee_St4x1( ; CHECK-NEXT: .param .b32 callee_St4x1_param_0 - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [callee_St4x1_param_0]; + ; CHECK: ld.param.b32 [[R1:%r[0-9]+]], [callee_St4x1_param_0]; ; CHECK: st.param.b32 [func_retval0], [[R1]]; ; CHECK-NEXT: ret; %oldret = insertvalue [1 x i32] poison, i32 %in.0.val, 0 @@ -140,7 +140,7 @@ define internal fastcc [2 x i32] @callee_St4x2(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[8]) ; CHECK-LABEL: callee_St4x2( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x2_param_0[8] - ; CHECK: ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x2_param_0]; + ; CHECK: ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x2_param_0]; ; CHECK: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]}; ; CHECK-NEXT: ret; %1 = load i32, ptr %in, align 4 @@ -183,8 +183,8 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[12]) ; CHECK-LABEL: callee_St4x3( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12] - ; CHECK: ld.param.v2.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8]; + ; CHECK: ld.param.v2.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]]}, [callee_St4x3_param_0]; + ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [callee_St4x3_param_0+8]; ; CHECK: 
st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]}; ; CHECK: st.param.b32 [func_retval0+8], [[R3]]; ; CHECK-NEXT: ret; @@ -232,7 +232,7 @@ define internal fastcc [4 x i32] @callee_St4x4(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) ; CHECK-LABEL: callee_St4x4( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x4_param_0[16] - ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_param_0]; + ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_param_0]; ; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK-NEXT: ret; %1 = load i32, ptr %in, align 4 @@ -287,8 +287,8 @@ define internal fastcc [5 x i32] @callee_St4x5(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[20]) ; CHECK-LABEL: callee_St4x5( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x5_param_0[20] - ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x5_param_0]; - ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [callee_St4x5_param_0+16]; + ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x5_param_0]; + ; CHECK: ld.param.b32 [[R5:%r[0-9]+]], [callee_St4x5_param_0+16]; ; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK: st.param.b32 [func_retval0+16], [[R5]]; ; CHECK-NEXT: ret; @@ -350,8 +350,8 @@ define internal fastcc [6 x i32] @callee_St4x6(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[24]) ; CHECK-LABEL: callee_St4x6( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x6_param_0[24] - ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x6_param_0]; - ; CHECK: ld.param.v2.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x6_param_0+16]; + ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], 
[[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x6_param_0]; + ; CHECK: ld.param.v2.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x6_param_0+16]; ; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; ; CHECK-NEXT: ret; @@ -421,9 +421,9 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[28]) ; CHECK-LABEL: callee_St4x7( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28] - ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0]; - ; CHECK: ld.param.v2.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16]; - ; CHECK: ld.param.u32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24]; + ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0]; + ; CHECK: ld.param.v2.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]]}, [callee_St4x7_param_0+16]; + ; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [callee_St4x7_param_0+24]; ; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]}; ; CHECK: st.param.b32 [func_retval0+24], [[R7]]; @@ -498,8 +498,8 @@ define internal fastcc [8 x i32] @callee_St4x8(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[32]) ; CHECK-LABEL: callee_St4x8( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x8_param_0[32] - ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x8_param_0]; - ; CHECK: ld.param.v4.u32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], [[R8:%r[0-9]+]]}, [callee_St4x8_param_0+16]; + ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x8_param_0]; + ; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], 
[[R7:%r[0-9]+]], [[R8:%r[0-9]+]]}, [callee_St4x8_param_0+16]; ; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK: st.param.v4.b32 [func_retval0+16], {[[R5]], [[R6]], [[R7]], [[R8]]}; ; CHECK-NEXT: ret; @@ -554,7 +554,7 @@ define internal fastcc [1 x i64] @callee_St8x1(i64 %in.0.val) { ; CHECK: .func (.param .align 16 .b8 func_retval0[8]) ; CHECK-LABEL: callee_St8x1( ; CHECK-NEXT: .param .b64 callee_St8x1_param_0 - ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [callee_St8x1_param_0]; + ; CHECK: ld.param.b64 [[RD1:%rd[0-9]+]], [callee_St8x1_param_0]; ; CHECK: st.param.b64 [func_retval0], [[RD1]]; ; CHECK-NEXT: ret; %oldret = insertvalue [1 x i64] poison, i64 %in.0.val, 0 @@ -588,7 +588,7 @@ define internal fastcc [2 x i64] @callee_St8x2(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) ; CHECK-LABEL: callee_St8x2( ; CHECK-NEXT: .param .align 16 .b8 callee_St8x2_param_0[16] - ; CHECK: ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x2_param_0]; + ; CHECK: ld.param.v2.b64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x2_param_0]; ; CHECK: st.param.v2.b64 [func_retval0], {[[RD1]], [[RD2]]}; ; CHECK-NEXT: ret; %1 = load i64, ptr %in, align 8 @@ -631,8 +631,8 @@ define internal fastcc [3 x i64] @callee_St8x3(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[24]) ; CHECK-LABEL: callee_St8x3( ; CHECK-NEXT: .param .align 16 .b8 callee_St8x3_param_0[24] - ; CHECK: ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x3_param_0]; - ; CHECK: ld.param.u64 [[RD3:%rd[0-9]+]], [callee_St8x3_param_0+16]; + ; CHECK: ld.param.v2.b64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x3_param_0]; + ; CHECK: ld.param.b64 [[RD3:%rd[0-9]+]], [callee_St8x3_param_0+16]; ; CHECK: st.param.v2.b64 [func_retval0], {[[RD1]], [[RD2]]}; ; CHECK: st.param.b64 [func_retval0+16], [[RD3]]; ; CHECK-NEXT: ret; @@ -682,8 +682,8 @@ define internal fastcc [4 x i64] 
@callee_St8x4(ptr nocapture noundef readonly by ; CHECK: .func (.param .align 16 .b8 func_retval0[32]) ; CHECK-LABEL: callee_St8x4( ; CHECK-NEXT: .param .align 16 .b8 callee_St8x4_param_0[32] - ; CHECK: ld.param.v2.u64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x4_param_0]; - ; CHECK: ld.param.v2.u64 {[[RD3:%rd[0-9]+]], [[RD4:%rd[0-9]+]]}, [callee_St8x4_param_0+16]; + ; CHECK: ld.param.v2.b64 {[[RD1:%rd[0-9]+]], [[RD2:%rd[0-9]+]]}, [callee_St8x4_param_0]; + ; CHECK: ld.param.v2.b64 {[[RD3:%rd[0-9]+]], [[RD4:%rd[0-9]+]]}, [callee_St8x4_param_0+16]; ; CHECK: st.param.v2.b64 [func_retval0], {[[RD1]], [[RD2]]}; ; CHECK: st.param.v2.b64 [func_retval0+16], {[[RD3]], [[RD4]]}; ; CHECK-NEXT: ret; @@ -707,7 +707,7 @@ define private fastcc [4 x i32] @callee_St4x4_private(ptr nocapture noundef read ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) ; CHECK-LABEL: callee_St4x4_private( ; CHECK-NEXT: .param .align 16 .b8 callee_St4x4_private_param_0[16] - ; CHECK: ld.param.v4.u32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_private_param_0]; + ; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x4_private_param_0]; ; CHECK: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]}; ; CHECK-NEXT: ret; %1 = load i32, ptr %in, align 4 @@ -731,10 +731,10 @@ define external fastcc [4 x i32] @callee_St4x4_external(ptr nocapture noundef re ; CHECK: .func (.param .align 4 .b8 func_retval0[16]) ; CHECK-LABEL: callee_St4x4_external( ; CHECK-NEXT: .param .align 4 .b8 callee_St4x4_external_param_0[16] - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [callee_St4x4_external_param_0]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [callee_St4x4_external_param_0+4]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [callee_St4x4_external_param_0+8]; - ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [callee_St4x4_external_param_0+12]; + ; CHECK: ld.param.b32 [[R1:%r[0-9]+]], [callee_St4x4_external_param_0]; + ; 
CHECK: ld.param.b32 [[R2:%r[0-9]+]], [callee_St4x4_external_param_0+4]; + ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [callee_St4x4_external_param_0+8]; + ; CHECK: ld.param.b32 [[R4:%r[0-9]+]], [callee_St4x4_external_param_0+12]; ; CHECK: st.param.b32 [func_retval0], [[R1]]; ; CHECK: st.param.b32 [func_retval0+4], [[R2]]; ; CHECK: st.param.b32 [func_retval0+8], [[R3]]; diff --git a/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll b/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll index 9ca1eddee9d7..410653805c98 100644 --- a/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll +++ b/llvm/test/CodeGen/NVPTX/param-vectorize-kernel.ll @@ -65,9 +65,9 @@ define dso_local void @foo_St4x1(ptr nocapture noundef readonly byval(%struct.St ; CHECK: .param .align 4 .b8 foo_St4x1_param_0[4], ; CHECK: .param .b64 foo_St4x1_param_1 ; CHECK: ) - ; CHECK: ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x1_param_1]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x1_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; + ; CHECK: ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x1_param_1]; + ; CHECK: ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x1_param_0]; + ; CHECK: st.b32 [[[R1]]], [[R2]]; ; CHECK: ret; %1 = load i32, ptr %in, align 4 store i32 %1, ptr %ret, align 4 @@ -79,11 +79,11 @@ define dso_local void @foo_St4x2(ptr nocapture noundef readonly byval(%struct.St ; CHECK: .param .align 4 .b8 foo_St4x2_param_0[8], ; CHECK: .param .b64 foo_St4x2_param_1 ; CHECK: ) - ; CHECK: ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x2_param_1]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x2_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x2_param_0+4]; - ; CHECK: st.u32 [[[R1]]+4], [[R3]]; + ; CHECK: ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x2_param_1]; + ; CHECK: ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x2_param_0]; + ; CHECK: st.b32 [[[R1]]], [[R2]]; + ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [foo_St4x2_param_0+4]; + ; CHECK: st.b32 [[[R1]]+4], [[R3]]; ; CHECK: ret; %1 = load i32, ptr %in, 
align 4 store i32 %1, ptr %ret, align 4 @@ -99,13 +99,13 @@ define dso_local void @foo_St4x3(ptr nocapture noundef readonly byval(%struct.St ; CHECK: .param .align 4 .b8 foo_St4x3_param_0[12], ; CHECK: .param .b64 foo_St4x3_param_1 ; CHECK: ) - ; CHECK: ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x3_param_1]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x3_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x3_param_0+4]; - ; CHECK: st.u32 [[[R1]]+4], [[R3]]; - ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x3_param_0+8]; - ; CHECK: st.u32 [[[R1]]+8], [[R4]]; + ; CHECK: ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x3_param_1]; + ; CHECK: ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x3_param_0]; + ; CHECK: st.b32 [[[R1]]], [[R2]]; + ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [foo_St4x3_param_0+4]; + ; CHECK: st.b32 [[[R1]]+4], [[R3]]; + ; CHECK: ld.param.b32 [[R4:%r[0-9]+]], [foo_St4x3_param_0+8]; + ; CHECK: st.b32 [[[R1]]+8], [[R4]]; ; CHECK: ret; %1 = load i32, ptr %in, align 4 store i32 %1, ptr %ret, align 4 @@ -125,15 +125,15 @@ define dso_local void @foo_St4x4(ptr nocapture noundef readonly byval(%struct.St ; CHECK: .param .align 4 .b8 foo_St4x4_param_0[16], ; CHECK: .param .b64 foo_St4x4_param_1 ; CHECK: ) - ; CHECK: ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x4_param_1]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x4_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x4_param_0+4]; - ; CHECK: st.u32 [[[R1]]+4], [[R3]]; - ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x4_param_0+8]; - ; CHECK: st.u32 [[[R1]]+8], [[R4]]; - ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x4_param_0+12]; - ; CHECK: st.u32 [[[R1]]+12], [[R5]]; + ; CHECK: ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x4_param_1]; + ; CHECK: ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x4_param_0]; + ; CHECK: st.b32 [[[R1]]], [[R2]]; + ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [foo_St4x4_param_0+4]; + ; CHECK: st.b32 [[[R1]]+4], [[R3]]; + ; CHECK: 
ld.param.b32 [[R4:%r[0-9]+]], [foo_St4x4_param_0+8]; + ; CHECK: st.b32 [[[R1]]+8], [[R4]]; + ; CHECK: ld.param.b32 [[R5:%r[0-9]+]], [foo_St4x4_param_0+12]; + ; CHECK: st.b32 [[[R1]]+12], [[R5]]; ; CHECK: ret; %1 = load i32, ptr %in, align 4 store i32 %1, ptr %ret, align 4 @@ -157,17 +157,17 @@ define dso_local void @foo_St4x5(ptr nocapture noundef readonly byval(%struct.St ; CHECK: .param .align 4 .b8 foo_St4x5_param_0[20], ; CHECK: .param .b64 foo_St4x5_param_1 ; CHECK: ) - ; CHECK: ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x5_param_1]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x5_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x5_param_0+4]; - ; CHECK: st.u32 [[[R1]]+4], [[R3]]; - ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x5_param_0+8]; - ; CHECK: st.u32 [[[R1]]+8], [[R4]]; - ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x5_param_0+12]; - ; CHECK: st.u32 [[[R1]]+12], [[R5]]; - ; CHECK: ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x5_param_0+16]; - ; CHECK: st.u32 [[[R1]]+16], [[R6]]; + ; CHECK: ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x5_param_1]; + ; CHECK: ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x5_param_0]; + ; CHECK: st.b32 [[[R1]]], [[R2]]; + ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [foo_St4x5_param_0+4]; + ; CHECK: st.b32 [[[R1]]+4], [[R3]]; + ; CHECK: ld.param.b32 [[R4:%r[0-9]+]], [foo_St4x5_param_0+8]; + ; CHECK: st.b32 [[[R1]]+8], [[R4]]; + ; CHECK: ld.param.b32 [[R5:%r[0-9]+]], [foo_St4x5_param_0+12]; + ; CHECK: st.b32 [[[R1]]+12], [[R5]]; + ; CHECK: ld.param.b32 [[R6:%r[0-9]+]], [foo_St4x5_param_0+16]; + ; CHECK: st.b32 [[[R1]]+16], [[R6]]; ; CHECK: ret; %1 = load i32, ptr %in, align 4 store i32 %1, ptr %ret, align 4 @@ -195,19 +195,19 @@ define dso_local void @foo_St4x6(ptr nocapture noundef readonly byval(%struct.St ; CHECK: .param .align 4 .b8 foo_St4x6_param_0[24], ; CHECK: .param .b64 foo_St4x6_param_1 ; CHECK: ) - ; CHECK: ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x6_param_1]; - ; CHECK: ld.param.u32 
[[R2:%r[0-9]+]], [foo_St4x6_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x6_param_0+4]; - ; CHECK: st.u32 [[[R1]]+4], [[R3]]; - ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x6_param_0+8]; - ; CHECK: st.u32 [[[R1]]+8], [[R4]]; - ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x6_param_0+12]; - ; CHECK: st.u32 [[[R1]]+12], [[R5]]; - ; CHECK: ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x6_param_0+16]; - ; CHECK: st.u32 [[[R1]]+16], [[R6]]; - ; CHECK: ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x6_param_0+20]; - ; CHECK: st.u32 [[[R1]]+20], [[R7]]; + ; CHECK: ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x6_param_1]; + ; CHECK: ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x6_param_0]; + ; CHECK: st.b32 [[[R1]]], [[R2]]; + ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [foo_St4x6_param_0+4]; + ; CHECK: st.b32 [[[R1]]+4], [[R3]]; + ; CHECK: ld.param.b32 [[R4:%r[0-9]+]], [foo_St4x6_param_0+8]; + ; CHECK: st.b32 [[[R1]]+8], [[R4]]; + ; CHECK: ld.param.b32 [[R5:%r[0-9]+]], [foo_St4x6_param_0+12]; + ; CHECK: st.b32 [[[R1]]+12], [[R5]]; + ; CHECK: ld.param.b32 [[R6:%r[0-9]+]], [foo_St4x6_param_0+16]; + ; CHECK: st.b32 [[[R1]]+16], [[R6]]; + ; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [foo_St4x6_param_0+20]; + ; CHECK: st.b32 [[[R1]]+20], [[R7]]; ; CHECK: ret; %1 = load i32, ptr %in, align 4 store i32 %1, ptr %ret, align 4 @@ -239,21 +239,21 @@ define dso_local void @foo_St4x7(ptr nocapture noundef readonly byval(%struct.St ; CHECK: .param .align 4 .b8 foo_St4x7_param_0[28], ; CHECK: .param .b64 foo_St4x7_param_1 ; CHECK: ) - ; CHECK: ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x7_param_1]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x7_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x7_param_0+4]; - ; CHECK: st.u32 [[[R1]]+4], [[R3]]; - ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x7_param_0+8]; - ; CHECK: st.u32 [[[R1]]+8], [[R4]]; - ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x7_param_0+12]; - ; CHECK: st.u32 
[[[R1]]+12], [[R5]]; - ; CHECK: ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x7_param_0+16]; - ; CHECK: st.u32 [[[R1]]+16], [[R6]]; - ; CHECK: ld.param.u32 [[R7:%r[0-9]+]], [foo_St4x7_param_0+20]; - ; CHECK: st.u32 [[[R1]]+20], [[R7]]; - ; CHECK: ld.param.u32 [[R8:%r[0-9]+]], [foo_St4x7_param_0+24]; - ; CHECK: st.u32 [[[R1]]+24], [[R8]]; + ; CHECK: ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x7_param_1]; + ; CHECK: ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x7_param_0]; + ; CHECK: st.b32 [[[R1]]], [[R2]]; + ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [foo_St4x7_param_0+4]; + ; CHECK: st.b32 [[[R1]]+4], [[R3]]; + ; CHECK: ld.param.b32 [[R4:%r[0-9]+]], [foo_St4x7_param_0+8]; + ; CHECK: st.b32 [[[R1]]+8], [[R4]]; + ; CHECK: ld.param.b32 [[R5:%r[0-9]+]], [foo_St4x7_param_0+12]; + ; CHECK: st.b32 [[[R1]]+12], [[R5]]; + ; CHECK: ld.param.b32 [[R6:%r[0-9]+]], [foo_St4x7_param_0+16]; + ; CHECK: st.b32 [[[R1]]+16], [[R6]]; + ; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [foo_St4x7_param_0+20]; + ; CHECK: st.b32 [[[R1]]+20], [[R7]]; + ; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [foo_St4x7_param_0+24]; + ; CHECK: st.b32 [[[R1]]+24], [[R8]]; ; CHECK: ret; %1 = load i32, ptr %in, align 4 store i32 %1, ptr %ret, align 4 @@ -289,23 +289,23 @@ define dso_local void @foo_St4x8(ptr nocapture noundef readonly byval(%struct.St ; CHECK: .param .align 4 .b8 foo_St4x8_param_0[32], ; CHECK: .param .b64 foo_St4x8_param_1 ; CHECK: ) - ; CHECK: ld.param.u64 [[R1:%rd[0-9]+]], [foo_St4x8_param_1]; - ; CHECK: ld.param.u32 [[R2:%r[0-9]+]], [foo_St4x8_param_0]; - ; CHECK: st.u32 [[[R1]]], [[R2]]; - ; CHECK: ld.param.u32 [[R3:%r[0-9]+]], [foo_St4x8_param_0+4]; - ; CHECK: st.u32 [[[R1]]+4], [[R3]]; - ; CHECK: ld.param.u32 [[R4:%r[0-9]+]], [foo_St4x8_param_0+8]; - ; CHECK: st.u32 [[[R1]]+8], [[R4]]; - ; CHECK: ld.param.u32 [[R5:%r[0-9]+]], [foo_St4x8_param_0+12]; - ; CHECK: st.u32 [[[R1]]+12], [[R5]]; - ; CHECK: ld.param.u32 [[R6:%r[0-9]+]], [foo_St4x8_param_0+16]; - ; CHECK: st.u32 [[[R1]]+16], [[R6]]; - ; CHECK: ld.param.u32 
[[R7:%r[0-9]+]], [foo_St4x8_param_0+20]; - ; CHECK: st.u32 [[[R1]]+20], [[R7]]; - ; CHECK: ld.param.u32 [[R8:%r[0-9]+]], [foo_St4x8_param_0+24]; - ; CHECK: st.u32 [[[R1]]+24], [[R8]]; - ; CHECK: ld.param.u32 [[R9:%r[0-9]+]], [foo_St4x8_param_0+28]; - ; CHECK: st.u32 [[[R1]]+28], [[R9]]; + ; CHECK: ld.param.b64 [[R1:%rd[0-9]+]], [foo_St4x8_param_1]; + ; CHECK: ld.param.b32 [[R2:%r[0-9]+]], [foo_St4x8_param_0]; + ; CHECK: st.b32 [[[R1]]], [[R2]]; + ; CHECK: ld.param.b32 [[R3:%r[0-9]+]], [foo_St4x8_param_0+4]; + ; CHECK: st.b32 [[[R1]]+4], [[R3]]; + ; CHECK: ld.param.b32 [[R4:%r[0-9]+]], [foo_St4x8_param_0+8]; + ; CHECK: st.b32 [[[R1]]+8], [[R4]]; + ; CHECK: ld.param.b32 [[R5:%r[0-9]+]], [foo_St4x8_param_0+12]; + ; CHECK: st.b32 [[[R1]]+12], [[R5]]; + ; CHECK: ld.param.b32 [[R6:%r[0-9]+]], [foo_St4x8_param_0+16]; + ; CHECK: st.b32 [[[R1]]+16], [[R6]]; + ; CHECK: ld.param.b32 [[R7:%r[0-9]+]], [foo_St4x8_param_0+20]; + ; CHECK: st.b32 [[[R1]]+20], [[R7]]; + ; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [foo_St4x8_param_0+24]; + ; CHECK: st.b32 [[[R1]]+24], [[R8]]; + ; CHECK: ld.param.b32 [[R9:%r[0-9]+]], [foo_St4x8_param_0+28]; + ; CHECK: st.b32 [[[R1]]+28], [[R9]]; ; CHECK: ret; %1 = load i32, ptr %in, align 4 store i32 %1, ptr %ret, align 4 @@ -345,9 +345,9 @@ define dso_local void @foo_St8x1(ptr nocapture noundef readonly byval(%struct.St ; CHECK: .param .align 8 .b8 foo_St8x1_param_0[8], ; CHECK: .param .b64 foo_St8x1_param_1 ; CHECK: ) - ; CHECK: ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x1_param_1]; - ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x1_param_0]; - ; CHECK: st.u64 [[[R1]]], [[RD1]]; + ; CHECK: ld.param.b64 [[R1:%rd[0-9]+]], [foo_St8x1_param_1]; + ; CHECK: ld.param.b64 [[RD1:%rd[0-9]+]], [foo_St8x1_param_0]; + ; CHECK: st.b64 [[[R1]]], [[RD1]]; ; CHECK: ret; %1 = load i64, ptr %in, align 8 store i64 %1, ptr %ret, align 8 @@ -359,11 +359,11 @@ define dso_local void @foo_St8x2(ptr nocapture noundef readonly byval(%struct.St ; CHECK: .param .align 8 .b8 
foo_St8x2_param_0[16], ; CHECK: .param .b64 foo_St8x2_param_1 ; CHECK: ) - ; CHECK: ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x2_param_1]; - ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x2_param_0]; - ; CHECK: st.u64 [[[R1]]], [[RD1]]; - ; CHECK: ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x2_param_0+8]; - ; CHECK: st.u64 [[[R1]]+8], [[RD2]]; + ; CHECK: ld.param.b64 [[R1:%rd[0-9]+]], [foo_St8x2_param_1]; + ; CHECK: ld.param.b64 [[RD1:%rd[0-9]+]], [foo_St8x2_param_0]; + ; CHECK: st.b64 [[[R1]]], [[RD1]]; + ; CHECK: ld.param.b64 [[RD2:%rd[0-9]+]], [foo_St8x2_param_0+8]; + ; CHECK: st.b64 [[[R1]]+8], [[RD2]]; ; CHECK: ret; %1 = load i64, ptr %in, align 8 store i64 %1, ptr %ret, align 8 @@ -379,13 +379,13 @@ define dso_local void @foo_St8x3(ptr nocapture noundef readonly byval(%struct.St ; CHECK: .param .align 8 .b8 foo_St8x3_param_0[24], ; CHECK: .param .b64 foo_St8x3_param_1 ; CHECK: ) - ; CHECK: ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x3_param_1]; - ; CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x3_param_0]; - ; CHECK: st.u64 [[[R1]]], [[RD1]]; - ; CHECK: ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x3_param_0+8]; - ; CHECK: st.u64 [[[R1]]+8], [[RD2]]; - ; CHECK: ld.param.u64 [[RD3:%rd[0-9]+]], [foo_St8x3_param_0+16]; - ; CHECK: st.u64 [[[R1]]+16], [[RD3]]; + ; CHECK: ld.param.b64 [[R1:%rd[0-9]+]], [foo_St8x3_param_1]; + ; CHECK: ld.param.b64 [[RD1:%rd[0-9]+]], [foo_St8x3_param_0]; + ; CHECK: st.b64 [[[R1]]], [[RD1]]; + ; CHECK: ld.param.b64 [[RD2:%rd[0-9]+]], [foo_St8x3_param_0+8]; + ; CHECK: st.b64 [[[R1]]+8], [[RD2]]; + ; CHECK: ld.param.b64 [[RD3:%rd[0-9]+]], [foo_St8x3_param_0+16]; + ; CHECK: st.b64 [[[R1]]+16], [[RD3]]; ; CHECK: ret; %1 = load i64, ptr %in, align 8 store i64 %1, ptr %ret, align 8 @@ -405,15 +405,15 @@ define dso_local void @foo_St8x4(ptr nocapture noundef readonly byval(%struct.St ; CHECK: .param .align 8 .b8 foo_St8x4_param_0[32], ; CHECK: .param .b64 foo_St8x4_param_1 ; CHECK: ) - ; CHECK: ld.param.u64 [[R1:%rd[0-9]+]], [foo_St8x4_param_1]; - ; 
CHECK: ld.param.u64 [[RD1:%rd[0-9]+]], [foo_St8x4_param_0]; - ; CHECK: st.u64 [[[R1]]], [[RD1]]; - ; CHECK: ld.param.u64 [[RD2:%rd[0-9]+]], [foo_St8x4_param_0+8]; - ; CHECK: st.u64 [[[R1]]+8], [[RD2]]; - ; CHECK: ld.param.u64 [[RD3:%rd[0-9]+]], [foo_St8x4_param_0+16]; - ; CHECK: st.u64 [[[R1]]+16], [[RD3]]; - ; CHECK: ld.param.u64 [[RD4:%rd[0-9]+]], [foo_St8x4_param_0+24]; - ; CHECK: st.u64 [[[R1]]+24], [[RD4]]; + ; CHECK: ld.param.b64 [[R1:%rd[0-9]+]], [foo_St8x4_param_1]; + ; CHECK: ld.param.b64 [[RD1:%rd[0-9]+]], [foo_St8x4_param_0]; + ; CHECK: st.b64 [[[R1]]], [[RD1]]; + ; CHECK: ld.param.b64 [[RD2:%rd[0-9]+]], [foo_St8x4_param_0+8]; + ; CHECK: st.b64 [[[R1]]+8], [[RD2]]; + ; CHECK: ld.param.b64 [[RD3:%rd[0-9]+]], [foo_St8x4_param_0+16]; + ; CHECK: st.b64 [[[R1]]+16], [[RD3]]; + ; CHECK: ld.param.b64 [[RD4:%rd[0-9]+]], [foo_St8x4_param_0+24]; + ; CHECK: st.b64 [[[R1]]+24], [[RD4]]; ; CHECK: ret; %1 = load i64, ptr %in, align 8 store i64 %1, ptr %ret, align 8 diff --git a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll index 191d4711e803..b6f1964c54c7 100644 --- a/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll +++ b/llvm/test/CodeGen/NVPTX/pr13291-i1-store.ll @@ -5,19 +5,19 @@ define ptx_kernel void @t1(ptr %a) { ; PTX32: mov.b16 %rs{{[0-9]+}}, 0; -; PTX32-NEXT: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}}; +; PTX32-NEXT: st.global.b8 [%r{{[0-9]+}}], %rs{{[0-9]+}}; ; PTX64: mov.b16 %rs{{[0-9]+}}, 0; -; PTX64-NEXT: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}; +; PTX64-NEXT: st.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}}; store i1 false, ptr %a ret void } define ptx_kernel void @t2(ptr %a, ptr %b) { -; PTX32: ld.global.u8 %rs{{[0-9]+}}, [%r{{[0-9]+}}] +; PTX32: ld.global.b8 %rs{{[0-9]+}}, [%r{{[0-9]+}}] ; PTX32: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, 1; ; PTX32: setp.ne.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 0; -; PTX64: ld.global.u8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] +; PTX64: ld.global.b8 %rs{{[0-9]+}}, [%rd{{[0-9]+}}] ; PTX64: and.b16 
%rs{{[0-9]+}}, %rs{{[0-9]+}}, 1; ; PTX64: setp.ne.b16 %p{{[0-9]+}}, %rs{{[0-9]+}}, 0; diff --git a/llvm/test/CodeGen/NVPTX/pr16278.ll b/llvm/test/CodeGen/NVPTX/pr16278.ll index ad832dcde35a..508786be0303 100644 --- a/llvm/test/CodeGen/NVPTX/pr16278.ll +++ b/llvm/test/CodeGen/NVPTX/pr16278.ll @@ -4,7 +4,7 @@ @one_f = addrspace(4) global float 1.000000e+00, align 4 define float @foo() { -; CHECK: ld.const.f32 +; CHECK: ld.const.b32 %val = load float, ptr addrspace(4) @one_f ret float %val } diff --git a/llvm/test/CodeGen/NVPTX/prefetch.ll b/llvm/test/CodeGen/NVPTX/prefetch.ll index 68512bfac7a2..a64e4fe7a508 100644 --- a/llvm/test/CodeGen/NVPTX/prefetch.ll +++ b/llvm/test/CodeGen/NVPTX/prefetch.ll @@ -23,7 +23,7 @@ define void @prefetch_local(ptr addrspace(5) %local_ptr) { ; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetch_local_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_local_param_0]; ; CHECK-PTX64-NEXT: prefetch.local.L1 [%rd1]; ; CHECK-PTX64-NEXT: prefetch.local.L2 [%rd1]; ; CHECK-PTX64-NEXT: ret; @@ -38,7 +38,7 @@ define void @prefetch_global(ptr addrspace(1) %global_ptr) { ; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetch_global_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch_global_param_0]; ; CHECK-PTX64-NEXT: prefetch.global.L1 [%rd1]; ; CHECK-PTX64-NEXT: prefetch.global.L2 [%rd1]; ; CHECK-PTX64-NEXT: prefetch.global.L2::evict_normal [%rd1]; @@ -58,7 +58,7 @@ define void @prefetch_(ptr %ptr) { ; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetch__param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetch__param_0]; ; CHECK-PTX64-NEXT: prefetch.L1 [%rd1]; ; CHECK-PTX64-NEXT: prefetch.L2 [%rd1]; ; CHECK-PTX64-NEXT: ret; @@ -73,7 +73,7 @@ define void @prefetchu_l1(ptr %ptr) 
{ ; CHECK-PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [prefetchu_l1_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [prefetchu_l1_param_0]; ; CHECK-PTX64-NEXT: prefetchu.L1 [%rd1]; ; CHECK-PTX64-NEXT: ret; tail call void @llvm.nvvm.prefetchu.L1(ptr %ptr) diff --git a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll index b95a3287474c..c39716bef4d7 100644 --- a/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll +++ b/llvm/test/CodeGen/NVPTX/proxy-reg-erasure-ptx.ll @@ -107,12 +107,12 @@ declare float @callee_f32() define float @check_f32() { ; PTX-LABEL: check_f32 ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}} - ; PTX-DAG: ld.param.f32 [[LD:%f[0-9]+]], [retval0]; + ; PTX-DAG: ld.param.b32 [[LD:%f[0-9]+]], [retval0]; ; PTX-DAG: } // callseq {{[0-9]+}} ; PTX-WITHOUT-DAG: mov.b32 [[PROXY:%f[0-9]+]], [[LD]]; - ; PTX-WITHOUT-DAG: st.param.f32 [func_retval0], [[PROXY]]; - ; PTX-WITH-DAG: st.param.f32 [func_retval0], [[LD]]; + ; PTX-WITHOUT-DAG: st.param.b32 [func_retval0], [[PROXY]]; + ; PTX-WITH-DAG: st.param.b32 [func_retval0], [[LD]]; %ret = call float @callee_f32() ret float %ret @@ -122,12 +122,12 @@ declare double @callee_f64() define double @check_f64() { ; PTX-LABEL: check_f64 ; PTX-DAG: { // callseq {{[0-9]+}}, {{[0-9]+}} - ; PTX-DAG: ld.param.f64 [[LD:%fd[0-9]+]], [retval0]; + ; PTX-DAG: ld.param.b64 [[LD:%fd[0-9]+]], [retval0]; ; PTX-DAG: } // callseq {{[0-9]+}} ; PTX-WITHOUT-DAG: mov.b64 [[PROXY:%fd[0-9]+]], [[LD]]; - ; PTX-WITHOUT-DAG: st.param.f64 [func_retval0], [[PROXY]]; - ; PTX-WITH-DAG: st.param.f64 [func_retval0], [[LD]]; + ; PTX-WITHOUT-DAG: st.param.b64 [func_retval0], [[PROXY]]; + ; PTX-WITH-DAG: st.param.b64 [func_retval0], [[LD]]; %ret = call double @callee_f64() ret double %ret @@ -170,13 +170,13 @@ declare <2 x double> @callee_vec_f64() define <2 x double> @check_vec_f64() { ; PTX-LABEL: check_vec_f64 ; PTX-DAG: { // 
callseq {{[0-9]+}}, {{[0-9]+}} - ; PTX-DAG: ld.param.v2.f64 {[[LD0:%fd[0-9]+]], [[LD1:%fd[0-9]+]]}, [retval0]; + ; PTX-DAG: ld.param.v2.b64 {[[LD0:%fd[0-9]+]], [[LD1:%fd[0-9]+]]}, [retval0]; ; PTX-DAG: } // callseq {{[0-9]+}} ; PTX-WITHOUT-DAG: mov.b64 [[PROXY0:%fd[0-9]+]], [[LD0]]; ; PTX-WITHOUT-DAG: mov.b64 [[PROXY1:%fd[0-9]+]], [[LD1]]; - ; PTX-WITHOUT-DAG: st.param.v2.f64 [func_retval0], {[[PROXY0]], [[PROXY1]]}; - ; PTX-WITH-DAG: st.param.v2.f64 [func_retval0], {[[LD0]], [[LD1]]}; + ; PTX-WITHOUT-DAG: st.param.v2.b64 [func_retval0], {[[PROXY0]], [[PROXY1]]}; + ; PTX-WITH-DAG: st.param.v2.b64 [func_retval0], {[[LD0]], [[LD1]]}; %ret = call <2 x double> @callee_vec_f64() ret <2 x double> %ret diff --git a/llvm/test/CodeGen/NVPTX/rcp-opt.ll b/llvm/test/CodeGen/NVPTX/rcp-opt.ll index 0b020b775138..e0ef5baf21bf 100644 --- a/llvm/test/CodeGen/NVPTX/rcp-opt.ll +++ b/llvm/test/CodeGen/NVPTX/rcp-opt.ll @@ -12,10 +12,10 @@ define double @test1(double %in) { ; CHECK-NEXT: .reg .b64 %fd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [test1_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [test1_param_0]; ; CHECK-NEXT: rcp.rn.f64 %fd2, %fd1; ; CHECK-NEXT: neg.f64 %fd3, %fd2; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd3; ; CHECK-NEXT: ret; %div = fdiv double 1.000000e+00, %in %neg = fsub double -0.000000e+00, %div @@ -30,10 +30,10 @@ define double @test2(double %in) { ; CHECK-NEXT: .reg .b64 %fd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [test2_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [test2_param_0]; ; CHECK-NEXT: rcp.rn.f64 %fd2, %fd1; ; CHECK-NEXT: neg.f64 %fd3, %fd2; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd3; ; CHECK-NEXT: ret; %div = fdiv double -1.000000e+00, %in ret double %div @@ -47,10 +47,10 @@ define double @test3(double %in) { ; CHECK-NEXT: .reg .b64 %fd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // 
%bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [test3_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [test3_param_0]; ; CHECK-NEXT: rcp.rn.f64 %fd2, %fd1; ; CHECK-NEXT: neg.f64 %fd3, %fd2; -; CHECK-NEXT: st.param.f64 [func_retval0], %fd3; +; CHECK-NEXT: st.param.b64 [func_retval0], %fd3; ; CHECK-NEXT: ret; %neg = fsub double -0.000000e+00, %in %div = fdiv double 1.000000e+00, %neg diff --git a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll index 1d14be9070b0..aa463b510fe8 100644 --- a/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll +++ b/llvm/test/CodeGen/NVPTX/read-global-variable-constant.ll @@ -10,21 +10,21 @@ ; CHECK-LABEL: test_gv_float() define float @test_gv_float() { -; CHECK: ld.global.nc.f32 +; CHECK: ld.global.nc.b32 %v = load float, ptr @gv_float ret float %v } ; CHECK-LABEL: test_gv_float2() define <2 x float> @test_gv_float2() { -; CHECK: ld.global.nc.v2.f32 +; CHECK: ld.global.nc.v2.b32 %v = load <2 x float>, ptr @gv_float2 ret <2 x float> %v } ; CHECK-LABEL: test_gv_float4() define <4 x float> @test_gv_float4() { -; CHECK: ld.global.nc.v4.f32 +; CHECK: ld.global.nc.v4.b32 %v = load <4 x float>, ptr @gv_float4 ret <4 x float> %v } diff --git a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll index 020a61a1675a..180b90ff90a7 100644 --- a/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll +++ b/llvm/test/CodeGen/NVPTX/reduction-intrinsics.ll @@ -22,7 +22,7 @@ define half @reduce_fadd_half(<8 x half> %in) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_param_0]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: mov.b16 %rs3, 0x0000; ; CHECK-NEXT: add.rn.f16 %rs4, %rs1, %rs3; @@ -49,7 +49,7 @@ define half @reduce_fadd_half_reassoc(<8 x half> %in) { ; 
CHECK-SM80-NEXT: .reg .b32 %r<10>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; ; CHECK-SM80-NEXT: add.rn.f16x2 %r5, %r2, %r4; ; CHECK-SM80-NEXT: add.rn.f16x2 %r6, %r1, %r3; ; CHECK-SM80-NEXT: add.rn.f16x2 %r7, %r6, %r5; @@ -69,7 +69,7 @@ define half @reduce_fadd_half_reassoc(<8 x half> %in) { ; CHECK-SM100-NEXT: .reg .b32 %r<10>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fadd_half_reassoc_param_0]; ; CHECK-SM100-NEXT: add.rn.f16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: add.rn.f16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: add.rn.f16x2 %r7, %r6, %r5; @@ -118,8 +118,8 @@ define float @reduce_fadd_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<17>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fadd_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_param_0]; ; CHECK-NEXT: add.rn.f32 %f9, %f1, 0f00000000; ; CHECK-NEXT: add.rn.f32 %f10, %f9, %f2; ; CHECK-NEXT: add.rn.f32 %f11, %f10, %f3; @@ -128,7 +128,7 @@ define float @reduce_fadd_float(<8 x float> %in) { ; CHECK-NEXT: add.rn.f32 %f14, %f13, %f6; ; CHECK-NEXT: add.rn.f32 %f15, %f14, %f7; ; CHECK-NEXT: add.rn.f32 %f16, %f15, %f8; -; CHECK-NEXT: st.param.f32 [func_retval0], %f16; +; CHECK-NEXT: st.param.b32 [func_retval0], %f16; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in) ret float %res @@ -140,8 +140,8 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) { ; 
CHECK-NEXT: .reg .b32 %f<17>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fadd_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_reassoc_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fadd_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_reassoc_param_0]; ; CHECK-NEXT: add.rn.f32 %f9, %f3, %f7; ; CHECK-NEXT: add.rn.f32 %f10, %f1, %f5; ; CHECK-NEXT: add.rn.f32 %f11, %f4, %f8; @@ -150,7 +150,7 @@ define float @reduce_fadd_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: add.rn.f32 %f14, %f10, %f9; ; CHECK-NEXT: add.rn.f32 %f15, %f14, %f13; ; CHECK-NEXT: add.rn.f32 %f16, %f15, 0f00000000; -; CHECK-NEXT: st.param.f32 [func_retval0], %f16; +; CHECK-NEXT: st.param.b32 [func_retval0], %f16; ; CHECK-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <8 x float> %in) ret float %res @@ -162,9 +162,9 @@ define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f7, [reduce_fadd_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_reassoc_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b32 %f7, [reduce_fadd_float_reassoc_nonpow2_param_0+24]; +; CHECK-NEXT: ld.param.v2.b32 {%f5, %f6}, [reduce_fadd_float_reassoc_nonpow2_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fadd_float_reassoc_nonpow2_param_0]; ; CHECK-NEXT: add.rn.f32 %f8, %f3, %f7; ; CHECK-NEXT: add.rn.f32 %f9, %f1, %f5; ; CHECK-NEXT: add.rn.f32 %f10, %f9, %f8; @@ -172,7 +172,7 @@ define float @reduce_fadd_float_reassoc_nonpow2(<7 x float> %in) { ; CHECK-NEXT: add.rn.f32 %f12, %f11, %f4; ; CHECK-NEXT: add.rn.f32 %f13, %f10, %f12; ; CHECK-NEXT: add.rn.f32 
%f14, %f13, 0f00000000; -; CHECK-NEXT: st.param.f32 [func_retval0], %f14; +; CHECK-NEXT: st.param.b32 [func_retval0], %f14; ; CHECK-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fadd(float 0.0, <7 x float> %in) ret float %res @@ -186,7 +186,7 @@ define half @reduce_fmul_half(<8 x half> %in) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_param_0]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: mul.rn.f16 %rs3, %rs1, %rs2; ; CHECK-NEXT: mov.b32 {%rs4, %rs5}, %r2; @@ -211,7 +211,7 @@ define half @reduce_fmul_half_reassoc(<8 x half> %in) { ; CHECK-SM80-NEXT: .reg .b32 %r<10>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; ; CHECK-SM80-NEXT: mul.rn.f16x2 %r5, %r2, %r4; ; CHECK-SM80-NEXT: mul.rn.f16x2 %r6, %r1, %r3; ; CHECK-SM80-NEXT: mul.rn.f16x2 %r7, %r6, %r5; @@ -229,7 +229,7 @@ define half @reduce_fmul_half_reassoc(<8 x half> %in) { ; CHECK-SM100-NEXT: .reg .b32 %r<10>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmul_half_reassoc_param_0]; ; CHECK-SM100-NEXT: mul.rn.f16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: mul.rn.f16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: mul.rn.f16x2 %r7, %r6, %r5; @@ -277,8 +277,8 @@ define float @reduce_fmul_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%f5, 
%f6, %f7, %f8}, [reduce_fmul_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_param_0]; ; CHECK-NEXT: mul.rn.f32 %f9, %f1, %f2; ; CHECK-NEXT: mul.rn.f32 %f10, %f9, %f3; ; CHECK-NEXT: mul.rn.f32 %f11, %f10, %f4; @@ -286,7 +286,7 @@ define float @reduce_fmul_float(<8 x float> %in) { ; CHECK-NEXT: mul.rn.f32 %f13, %f12, %f6; ; CHECK-NEXT: mul.rn.f32 %f14, %f13, %f7; ; CHECK-NEXT: mul.rn.f32 %f15, %f14, %f8; -; CHECK-NEXT: st.param.f32 [func_retval0], %f15; +; CHECK-NEXT: st.param.b32 [func_retval0], %f15; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in) ret float %res @@ -298,8 +298,8 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmul_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_reassoc_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmul_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_reassoc_param_0]; ; CHECK-NEXT: mul.rn.f32 %f9, %f3, %f7; ; CHECK-NEXT: mul.rn.f32 %f10, %f1, %f5; ; CHECK-NEXT: mul.rn.f32 %f11, %f4, %f8; @@ -307,7 +307,7 @@ define float @reduce_fmul_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: mul.rn.f32 %f13, %f12, %f11; ; CHECK-NEXT: mul.rn.f32 %f14, %f10, %f9; ; CHECK-NEXT: mul.rn.f32 %f15, %f14, %f13; -; CHECK-NEXT: st.param.f32 [func_retval0], %f15; +; CHECK-NEXT: st.param.b32 [func_retval0], %f15; ; CHECK-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <8 x float> %in) ret float %res @@ -319,16 +319,16 @@ define float @reduce_fmul_float_reassoc_nonpow2(<7 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f7, [reduce_fmul_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, 
[reduce_fmul_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_reassoc_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b32 %f7, [reduce_fmul_float_reassoc_nonpow2_param_0+24]; +; CHECK-NEXT: ld.param.v2.b32 {%f5, %f6}, [reduce_fmul_float_reassoc_nonpow2_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmul_float_reassoc_nonpow2_param_0]; ; CHECK-NEXT: mul.rn.f32 %f8, %f3, %f7; ; CHECK-NEXT: mul.rn.f32 %f9, %f1, %f5; ; CHECK-NEXT: mul.rn.f32 %f10, %f9, %f8; ; CHECK-NEXT: mul.rn.f32 %f11, %f2, %f6; ; CHECK-NEXT: mul.rn.f32 %f12, %f11, %f4; ; CHECK-NEXT: mul.rn.f32 %f13, %f10, %f12; -; CHECK-NEXT: st.param.f32 [func_retval0], %f13; +; CHECK-NEXT: st.param.b32 [func_retval0], %f13; ; CHECK-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmul(float 1.0, <7 x float> %in) ret float %res @@ -342,7 +342,7 @@ define half @reduce_fmax_half(<8 x half> %in) { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_param_0]; ; CHECK-NEXT: max.f16x2 %r5, %r2, %r4; ; CHECK-NEXT: max.f16x2 %r6, %r1, %r3; ; CHECK-NEXT: max.f16x2 %r7, %r6, %r5; @@ -361,7 +361,7 @@ define half @reduce_fmax_half_reassoc(<8 x half> %in) { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_reassoc_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmax_half_reassoc_param_0]; ; CHECK-NEXT: max.f16x2 %r5, %r2, %r4; ; CHECK-NEXT: max.f16x2 %r6, %r1, %r3; ; CHECK-NEXT: max.f16x2 %r7, %r6, %r5; @@ -407,8 +407,8 @@ define float @reduce_fmax_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, 
%f2, %f3, %f4}, [reduce_fmax_float_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_param_0]; ; CHECK-NEXT: max.f32 %f9, %f4, %f8; ; CHECK-NEXT: max.f32 %f10, %f2, %f6; ; CHECK-NEXT: max.f32 %f11, %f10, %f9; @@ -416,7 +416,7 @@ define float @reduce_fmax_float(<8 x float> %in) { ; CHECK-NEXT: max.f32 %f13, %f1, %f5; ; CHECK-NEXT: max.f32 %f14, %f13, %f12; ; CHECK-NEXT: max.f32 %f15, %f14, %f11; -; CHECK-NEXT: st.param.f32 [func_retval0], %f15; +; CHECK-NEXT: st.param.b32 [func_retval0], %f15; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fmax(<8 x float> %in) ret float %res @@ -429,8 +429,8 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmax_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_param_0]; ; CHECK-NEXT: max.f32 %f9, %f4, %f8; ; CHECK-NEXT: max.f32 %f10, %f2, %f6; ; CHECK-NEXT: max.f32 %f11, %f10, %f9; @@ -438,7 +438,7 @@ define float @reduce_fmax_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: max.f32 %f13, %f1, %f5; ; CHECK-NEXT: max.f32 %f14, %f13, %f12; ; CHECK-NEXT: max.f32 %f15, %f14, %f11; -; CHECK-NEXT: st.param.f32 [func_retval0], %f15; +; CHECK-NEXT: st.param.b32 [func_retval0], %f15; ; CHECK-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmax(<8 x float> %in) ret float %res @@ -451,16 +451,16 @@ define float @reduce_fmax_float_reassoc_nonpow2(<7 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f7, [reduce_fmax_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: 
ld.param.v2.f32 {%f5, %f6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b32 %f7, [reduce_fmax_float_reassoc_nonpow2_param_0+24]; +; CHECK-NEXT: ld.param.v2.b32 {%f5, %f6}, [reduce_fmax_float_reassoc_nonpow2_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmax_float_reassoc_nonpow2_param_0]; ; CHECK-NEXT: max.f32 %f8, %f3, %f7; ; CHECK-NEXT: max.f32 %f9, %f1, %f5; ; CHECK-NEXT: max.f32 %f10, %f9, %f8; ; CHECK-NEXT: max.f32 %f11, %f2, %f6; ; CHECK-NEXT: max.f32 %f12, %f11, %f4; ; CHECK-NEXT: max.f32 %f13, %f10, %f12; -; CHECK-NEXT: st.param.f32 [func_retval0], %f13; +; CHECK-NEXT: st.param.b32 [func_retval0], %f13; ; CHECK-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmax(<7 x float> %in) ret float %res @@ -474,7 +474,7 @@ define half @reduce_fmin_half(<8 x half> %in) { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_param_0]; ; CHECK-NEXT: min.f16x2 %r5, %r2, %r4; ; CHECK-NEXT: min.f16x2 %r6, %r1, %r3; ; CHECK-NEXT: min.f16x2 %r7, %r6, %r5; @@ -493,7 +493,7 @@ define half @reduce_fmin_half_reassoc(<8 x half> %in) { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_reassoc_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmin_half_reassoc_param_0]; ; CHECK-NEXT: min.f16x2 %r5, %r2, %r4; ; CHECK-NEXT: min.f16x2 %r6, %r1, %r3; ; CHECK-NEXT: min.f16x2 %r7, %r6, %r5; @@ -539,8 +539,8 @@ define float @reduce_fmin_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, 
%f2, %f3, %f4}, [reduce_fmin_float_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmin_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_param_0]; ; CHECK-NEXT: min.f32 %f9, %f4, %f8; ; CHECK-NEXT: min.f32 %f10, %f2, %f6; ; CHECK-NEXT: min.f32 %f11, %f10, %f9; @@ -548,7 +548,7 @@ define float @reduce_fmin_float(<8 x float> %in) { ; CHECK-NEXT: min.f32 %f13, %f1, %f5; ; CHECK-NEXT: min.f32 %f14, %f13, %f12; ; CHECK-NEXT: min.f32 %f15, %f14, %f11; -; CHECK-NEXT: st.param.f32 [func_retval0], %f15; +; CHECK-NEXT: st.param.b32 [func_retval0], %f15; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fmin(<8 x float> %in) ret float %res @@ -561,8 +561,8 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmin_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_reassoc_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmin_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_reassoc_param_0]; ; CHECK-NEXT: min.f32 %f9, %f4, %f8; ; CHECK-NEXT: min.f32 %f10, %f2, %f6; ; CHECK-NEXT: min.f32 %f11, %f10, %f9; @@ -570,7 +570,7 @@ define float @reduce_fmin_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: min.f32 %f13, %f1, %f5; ; CHECK-NEXT: min.f32 %f14, %f13, %f12; ; CHECK-NEXT: min.f32 %f15, %f14, %f11; -; CHECK-NEXT: st.param.f32 [func_retval0], %f15; +; CHECK-NEXT: st.param.b32 [func_retval0], %f15; ; CHECK-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmin(<8 x float> %in) ret float %res @@ -583,16 +583,16 @@ define float @reduce_fmin_float_reassoc_nonpow2(<7 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f7, [reduce_fmin_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: 
ld.param.v2.f32 {%f5, %f6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_reassoc_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b32 %f7, [reduce_fmin_float_reassoc_nonpow2_param_0+24]; +; CHECK-NEXT: ld.param.v2.b32 {%f5, %f6}, [reduce_fmin_float_reassoc_nonpow2_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmin_float_reassoc_nonpow2_param_0]; ; CHECK-NEXT: min.f32 %f8, %f3, %f7; ; CHECK-NEXT: min.f32 %f9, %f1, %f5; ; CHECK-NEXT: min.f32 %f10, %f9, %f8; ; CHECK-NEXT: min.f32 %f11, %f2, %f6; ; CHECK-NEXT: min.f32 %f12, %f11, %f4; ; CHECK-NEXT: min.f32 %f13, %f10, %f12; -; CHECK-NEXT: st.param.f32 [func_retval0], %f13; +; CHECK-NEXT: st.param.b32 [func_retval0], %f13; ; CHECK-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmin(<7 x float> %in) ret float %res @@ -606,7 +606,7 @@ define half @reduce_fmaximum_half(<8 x half> %in) { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_half_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_half_param_0]; ; CHECK-NEXT: max.NaN.f16x2 %r5, %r2, %r4; ; CHECK-NEXT: max.NaN.f16x2 %r6, %r1, %r3; ; CHECK-NEXT: max.NaN.f16x2 %r7, %r6, %r5; @@ -625,7 +625,7 @@ define half @reduce_fmaximum_half_reassoc(<8 x half> %in) { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_half_reassoc_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fmaximum_half_reassoc_param_0]; ; CHECK-NEXT: max.NaN.f16x2 %r5, %r2, %r4; ; CHECK-NEXT: max.NaN.f16x2 %r6, %r1, %r3; ; CHECK-NEXT: max.NaN.f16x2 %r7, %r6, %r5; @@ -671,8 +671,8 @@ define float @reduce_fmaximum_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, 
[reduce_fmaximum_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_param_0]; ; CHECK-NEXT: max.NaN.f32 %f9, %f4, %f8; ; CHECK-NEXT: max.NaN.f32 %f10, %f2, %f6; ; CHECK-NEXT: max.NaN.f32 %f11, %f10, %f9; @@ -680,7 +680,7 @@ define float @reduce_fmaximum_float(<8 x float> %in) { ; CHECK-NEXT: max.NaN.f32 %f13, %f1, %f5; ; CHECK-NEXT: max.NaN.f32 %f14, %f13, %f12; ; CHECK-NEXT: max.NaN.f32 %f15, %f14, %f11; -; CHECK-NEXT: st.param.f32 [func_retval0], %f15; +; CHECK-NEXT: st.param.b32 [func_retval0], %f15; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fmaximum(<8 x float> %in) ret float %res @@ -693,8 +693,8 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_reassoc_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fmaximum_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_reassoc_param_0]; ; CHECK-NEXT: max.NaN.f32 %f9, %f4, %f8; ; CHECK-NEXT: max.NaN.f32 %f10, %f2, %f6; ; CHECK-NEXT: max.NaN.f32 %f11, %f10, %f9; @@ -702,7 +702,7 @@ define float @reduce_fmaximum_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: max.NaN.f32 %f13, %f1, %f5; ; CHECK-NEXT: max.NaN.f32 %f14, %f13, %f12; ; CHECK-NEXT: max.NaN.f32 %f15, %f14, %f11; -; CHECK-NEXT: st.param.f32 [func_retval0], %f15; +; CHECK-NEXT: st.param.b32 [func_retval0], %f15; ; CHECK-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmaximum(<8 x float> %in) ret float %res @@ -715,16 +715,16 @@ define float @reduce_fmaximum_float_reassoc_nonpow2(<7 x float> %in) { 
; CHECK-NEXT: .reg .b32 %f<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b32 %f7, [reduce_fmaximum_float_reassoc_nonpow2_param_0+24]; +; CHECK-NEXT: ld.param.v2.b32 {%f5, %f6}, [reduce_fmaximum_float_reassoc_nonpow2_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fmaximum_float_reassoc_nonpow2_param_0]; ; CHECK-NEXT: max.NaN.f32 %f8, %f3, %f7; ; CHECK-NEXT: max.NaN.f32 %f9, %f1, %f5; ; CHECK-NEXT: max.NaN.f32 %f10, %f9, %f8; ; CHECK-NEXT: max.NaN.f32 %f11, %f2, %f6; ; CHECK-NEXT: max.NaN.f32 %f12, %f11, %f4; ; CHECK-NEXT: max.NaN.f32 %f13, %f10, %f12; -; CHECK-NEXT: st.param.f32 [func_retval0], %f13; +; CHECK-NEXT: st.param.b32 [func_retval0], %f13; ; CHECK-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fmaximum(<7 x float> %in) ret float %res @@ -738,7 +738,7 @@ define half @reduce_fminimum_half(<8 x half> %in) { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_half_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_half_param_0]; ; CHECK-NEXT: min.NaN.f16x2 %r5, %r2, %r4; ; CHECK-NEXT: min.NaN.f16x2 %r6, %r1, %r3; ; CHECK-NEXT: min.NaN.f16x2 %r7, %r6, %r5; @@ -757,7 +757,7 @@ define half @reduce_fminimum_half_reassoc(<8 x half> %in) { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_half_reassoc_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_fminimum_half_reassoc_param_0]; ; CHECK-NEXT: min.NaN.f16x2 %r5, %r2, %r4; ; CHECK-NEXT: min.NaN.f16x2 %r6, %r1, %r3; ; CHECK-NEXT: min.NaN.f16x2 %r7, %r6, %r5; 
@@ -803,8 +803,8 @@ define float @reduce_fminimum_float(<8 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_float_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_float_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_param_0]; ; CHECK-NEXT: min.NaN.f32 %f9, %f4, %f8; ; CHECK-NEXT: min.NaN.f32 %f10, %f2, %f6; ; CHECK-NEXT: min.NaN.f32 %f11, %f10, %f9; @@ -812,7 +812,7 @@ define float @reduce_fminimum_float(<8 x float> %in) { ; CHECK-NEXT: min.NaN.f32 %f13, %f1, %f5; ; CHECK-NEXT: min.NaN.f32 %f14, %f13, %f12; ; CHECK-NEXT: min.NaN.f32 %f15, %f14, %f11; -; CHECK-NEXT: st.param.f32 [func_retval0], %f15; +; CHECK-NEXT: st.param.b32 [func_retval0], %f15; ; CHECK-NEXT: ret; %res = call float @llvm.vector.reduce.fminimum(<8 x float> %in) ret float %res @@ -825,8 +825,8 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.f32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_float_reassoc_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_reassoc_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%f5, %f6, %f7, %f8}, [reduce_fminimum_float_reassoc_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_reassoc_param_0]; ; CHECK-NEXT: min.NaN.f32 %f9, %f4, %f8; ; CHECK-NEXT: min.NaN.f32 %f10, %f2, %f6; ; CHECK-NEXT: min.NaN.f32 %f11, %f10, %f9; @@ -834,7 +834,7 @@ define float @reduce_fminimum_float_reassoc(<8 x float> %in) { ; CHECK-NEXT: min.NaN.f32 %f13, %f1, %f5; ; CHECK-NEXT: min.NaN.f32 %f14, %f13, %f12; ; CHECK-NEXT: min.NaN.f32 %f15, %f14, %f11; -; CHECK-NEXT: st.param.f32 [func_retval0], %f15; +; CHECK-NEXT: st.param.b32 [func_retval0], %f15; 
; CHECK-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fminimum(<8 x float> %in) ret float %res @@ -847,16 +847,16 @@ define float @reduce_fminimum_float_reassoc_nonpow2(<7 x float> %in) { ; CHECK-NEXT: .reg .b32 %f<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.f32 {%f5, %f6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.f32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_reassoc_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b32 %f7, [reduce_fminimum_float_reassoc_nonpow2_param_0+24]; +; CHECK-NEXT: ld.param.v2.b32 {%f5, %f6}, [reduce_fminimum_float_reassoc_nonpow2_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%f1, %f2, %f3, %f4}, [reduce_fminimum_float_reassoc_nonpow2_param_0]; ; CHECK-NEXT: min.NaN.f32 %f8, %f3, %f7; ; CHECK-NEXT: min.NaN.f32 %f9, %f1, %f5; ; CHECK-NEXT: min.NaN.f32 %f10, %f9, %f8; ; CHECK-NEXT: min.NaN.f32 %f11, %f2, %f6; ; CHECK-NEXT: min.NaN.f32 %f12, %f11, %f4; ; CHECK-NEXT: min.NaN.f32 %f13, %f10, %f12; -; CHECK-NEXT: st.param.f32 [func_retval0], %f13; +; CHECK-NEXT: st.param.b32 [func_retval0], %f13; ; CHECK-NEXT: ret; %res = call reassoc float @llvm.vector.reduce.fminimum(<7 x float> %in) ret float %res @@ -869,7 +869,7 @@ define i16 @reduce_add_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: .reg .b32 %r<6>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0]; ; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; ; CHECK-SM80-NEXT: add.s16 %rs5, %rs3, %rs1; @@ -891,7 +891,7 @@ define i16 @reduce_add_i16(<8 x i16> %in) { ; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0]; +; 
CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i16_param_0]; ; CHECK-SM100-NEXT: add.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: add.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: add.s16x2 %r7, %r6, %r5; @@ -914,10 +914,10 @@ define i16 @reduce_add_i16_nonpow2(<7 x i16> %in) { ; CHECK-SM80-NEXT: .reg .b32 %r<3>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.u32 %r1, [reduce_add_i16_nonpow2_param_0+8]; +; CHECK-SM80-NEXT: ld.param.b32 %r1, [reduce_add_i16_nonpow2_param_0+8]; ; CHECK-SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-SM80-NEXT: ld.param.u16 %rs7, [reduce_add_i16_nonpow2_param_0+12]; -; CHECK-SM80-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0]; +; CHECK-SM80-NEXT: ld.param.b16 %rs7, [reduce_add_i16_nonpow2_param_0+12]; +; CHECK-SM80-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0]; ; CHECK-SM80-NEXT: add.s16 %rs8, %rs3, %rs7; ; CHECK-SM80-NEXT: add.s16 %rs9, %rs1, %rs5; ; CHECK-SM80-NEXT: add.s16 %rs10, %rs9, %rs8; @@ -934,12 +934,12 @@ define i16 @reduce_add_i16_nonpow2(<7 x i16> %in) { ; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.u32 %r1, [reduce_add_i16_nonpow2_param_0+8]; +; CHECK-SM100-NEXT: ld.param.b32 %r1, [reduce_add_i16_nonpow2_param_0+8]; ; CHECK-SM100-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-SM100-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0]; +; CHECK-SM100-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_add_i16_nonpow2_param_0]; ; CHECK-SM100-NEXT: mov.b32 %r2, {%rs1, %rs2}; ; CHECK-SM100-NEXT: mov.b32 %r3, {%rs3, %rs4}; -; CHECK-SM100-NEXT: ld.param.u16 %rs7, [reduce_add_i16_nonpow2_param_0+12]; +; CHECK-SM100-NEXT: ld.param.b16 %rs7, [reduce_add_i16_nonpow2_param_0+12]; ; CHECK-SM100-NEXT: mov.b16 %rs8, 0; ; CHECK-SM100-NEXT: mov.b32 %r4, {%rs7, %rs8}; ; CHECK-SM100-NEXT: add.s16x2 %r5, %r3, %r4; @@ -960,8 +960,8 @@ 
define i32 @reduce_add_i32(<8 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_add_i32_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_param_0]; ; CHECK-NEXT: add.s32 %r9, %r3, %r7; ; CHECK-NEXT: add.s32 %r10, %r1, %r5; ; CHECK-NEXT: add.s32 %r11, %r4, %r8; @@ -981,9 +981,9 @@ define i32 @reduce_add_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r7, [reduce_add_i32_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_add_i32_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b32 %r7, [reduce_add_i32_nonpow2_param_0+24]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_add_i32_nonpow2_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_add_i32_nonpow2_param_0]; ; CHECK-NEXT: add.s32 %r8, %r3, %r7; ; CHECK-NEXT: add.s32 %r9, %r1, %r5; ; CHECK-NEXT: add.s32 %r10, %r9, %r8; @@ -1003,7 +1003,7 @@ define i16 @reduce_mul_i16(<8 x i16> %in) { ; CHECK-NEXT: .reg .b32 %r<6>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_mul_i16_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i16_param_0]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-NEXT: mov.b32 {%rs3, %rs4}, %r2; ; CHECK-NEXT: mul.lo.s16 %rs5, %rs3, %rs1; @@ -1029,10 +1029,10 @@ define i16 @reduce_mul_i16_nonpow2(<7 x i16> %in) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [reduce_mul_i16_nonpow2_param_0+8]; +; CHECK-NEXT: ld.param.b32 %r1, [reduce_mul_i16_nonpow2_param_0+8]; ; CHECK-NEXT: 
mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NEXT: ld.param.u16 %rs7, [reduce_mul_i16_nonpow2_param_0+12]; -; CHECK-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_mul_i16_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b16 %rs7, [reduce_mul_i16_nonpow2_param_0+12]; +; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_mul_i16_nonpow2_param_0]; ; CHECK-NEXT: mul.lo.s16 %rs8, %rs3, %rs7; ; CHECK-NEXT: mul.lo.s16 %rs9, %rs1, %rs5; ; CHECK-NEXT: mul.lo.s16 %rs10, %rs9, %rs8; @@ -1052,8 +1052,8 @@ define i32 @reduce_mul_i32(<8 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_mul_i32_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_param_0]; ; CHECK-NEXT: mul.lo.s32 %r9, %r3, %r7; ; CHECK-NEXT: mul.lo.s32 %r10, %r1, %r5; ; CHECK-NEXT: mul.lo.s32 %r11, %r4, %r8; @@ -1073,9 +1073,9 @@ define i32 @reduce_mul_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r7, [reduce_mul_i32_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_mul_i32_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b32 %r7, [reduce_mul_i32_nonpow2_param_0+24]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_mul_i32_nonpow2_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_mul_i32_nonpow2_param_0]; ; CHECK-NEXT: mul.lo.s32 %r8, %r3, %r7; ; CHECK-NEXT: mul.lo.s32 %r9, %r1, %r5; ; CHECK-NEXT: mul.lo.s32 %r10, %r9, %r8; @@ -1095,7 +1095,7 @@ define i16 @reduce_umax_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: .reg .b32 %r<6>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.u32 {%r1, 
%r2, %r3, %r4}, [reduce_umax_i16_param_0]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0]; ; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; ; CHECK-SM80-NEXT: max.u16 %rs5, %rs3, %rs1; @@ -1117,7 +1117,7 @@ define i16 @reduce_umax_i16(<8 x i16> %in) { ; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i16_param_0]; ; CHECK-SM100-NEXT: max.u16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: max.u16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: max.u16x2 %r7, %r6, %r5; @@ -1140,10 +1140,10 @@ define i16 @reduce_umax_i16_nonpow2(<7 x i16> %in) { ; CHECK-SM80-NEXT: .reg .b32 %r<3>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.u32 %r1, [reduce_umax_i16_nonpow2_param_0+8]; +; CHECK-SM80-NEXT: ld.param.b32 %r1, [reduce_umax_i16_nonpow2_param_0+8]; ; CHECK-SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-SM80-NEXT: ld.param.u16 %rs7, [reduce_umax_i16_nonpow2_param_0+12]; -; CHECK-SM80-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umax_i16_nonpow2_param_0]; +; CHECK-SM80-NEXT: ld.param.b16 %rs7, [reduce_umax_i16_nonpow2_param_0+12]; +; CHECK-SM80-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umax_i16_nonpow2_param_0]; ; CHECK-SM80-NEXT: max.u16 %rs8, %rs3, %rs7; ; CHECK-SM80-NEXT: max.u16 %rs9, %rs1, %rs5; ; CHECK-SM80-NEXT: max.u16 %rs10, %rs9, %rs8; @@ -1160,12 +1160,12 @@ define i16 @reduce_umax_i16_nonpow2(<7 x i16> %in) { ; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.u32 %r1, [reduce_umax_i16_nonpow2_param_0+8]; +; CHECK-SM100-NEXT: ld.param.b32 %r1, [reduce_umax_i16_nonpow2_param_0+8]; ; CHECK-SM100-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-SM100-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, 
[reduce_umax_i16_nonpow2_param_0]; +; CHECK-SM100-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umax_i16_nonpow2_param_0]; ; CHECK-SM100-NEXT: mov.b32 %r2, {%rs1, %rs2}; ; CHECK-SM100-NEXT: mov.b32 %r3, {%rs3, %rs4}; -; CHECK-SM100-NEXT: ld.param.u16 %rs7, [reduce_umax_i16_nonpow2_param_0+12]; +; CHECK-SM100-NEXT: ld.param.b16 %rs7, [reduce_umax_i16_nonpow2_param_0+12]; ; CHECK-SM100-NEXT: mov.b16 %rs8, 0; ; CHECK-SM100-NEXT: mov.b32 %r4, {%rs7, %rs8}; ; CHECK-SM100-NEXT: max.u16x2 %r5, %r3, %r4; @@ -1186,8 +1186,8 @@ define i32 @reduce_umax_i32(<8 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umax_i32_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_param_0]; ; CHECK-NEXT: max.u32 %r9, %r3, %r7; ; CHECK-NEXT: max.u32 %r10, %r1, %r5; ; CHECK-NEXT: max.u32 %r11, %r4, %r8; @@ -1207,9 +1207,9 @@ define i32 @reduce_umax_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r7, [reduce_umax_i32_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_umax_i32_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b32 %r7, [reduce_umax_i32_nonpow2_param_0+24]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_umax_i32_nonpow2_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umax_i32_nonpow2_param_0]; ; CHECK-NEXT: max.u32 %r8, %r3, %r7; ; CHECK-NEXT: max.u32 %r9, %r1, %r5; ; CHECK-NEXT: max.u32 %r10, %r9, %r8; @@ -1229,7 +1229,7 @@ define i16 @reduce_umin_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: .reg .b32 %r<6>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: 
ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0]; ; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; ; CHECK-SM80-NEXT: min.u16 %rs5, %rs3, %rs1; @@ -1251,7 +1251,7 @@ define i16 @reduce_umin_i16(<8 x i16> %in) { ; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i16_param_0]; ; CHECK-SM100-NEXT: min.u16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: min.u16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: min.u16x2 %r7, %r6, %r5; @@ -1274,10 +1274,10 @@ define i16 @reduce_umin_i16_nonpow2(<7 x i16> %in) { ; CHECK-SM80-NEXT: .reg .b32 %r<3>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.u32 %r1, [reduce_umin_i16_nonpow2_param_0+8]; +; CHECK-SM80-NEXT: ld.param.b32 %r1, [reduce_umin_i16_nonpow2_param_0+8]; ; CHECK-SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-SM80-NEXT: ld.param.u16 %rs7, [reduce_umin_i16_nonpow2_param_0+12]; -; CHECK-SM80-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0]; +; CHECK-SM80-NEXT: ld.param.b16 %rs7, [reduce_umin_i16_nonpow2_param_0+12]; +; CHECK-SM80-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0]; ; CHECK-SM80-NEXT: min.u16 %rs8, %rs3, %rs7; ; CHECK-SM80-NEXT: min.u16 %rs9, %rs1, %rs5; ; CHECK-SM80-NEXT: min.u16 %rs10, %rs9, %rs8; @@ -1294,12 +1294,12 @@ define i16 @reduce_umin_i16_nonpow2(<7 x i16> %in) { ; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.u32 %r1, [reduce_umin_i16_nonpow2_param_0+8]; +; CHECK-SM100-NEXT: ld.param.b32 %r1, [reduce_umin_i16_nonpow2_param_0+8]; ; CHECK-SM100-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-SM100-NEXT: ld.param.v4.u16 {%rs1, 
%rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0]; +; CHECK-SM100-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_umin_i16_nonpow2_param_0]; ; CHECK-SM100-NEXT: mov.b32 %r2, {%rs1, %rs2}; ; CHECK-SM100-NEXT: mov.b32 %r3, {%rs3, %rs4}; -; CHECK-SM100-NEXT: ld.param.u16 %rs7, [reduce_umin_i16_nonpow2_param_0+12]; +; CHECK-SM100-NEXT: ld.param.b16 %rs7, [reduce_umin_i16_nonpow2_param_0+12]; ; CHECK-SM100-NEXT: mov.b16 %rs8, -1; ; CHECK-SM100-NEXT: mov.b32 %r4, {%rs7, %rs8}; ; CHECK-SM100-NEXT: min.u16x2 %r5, %r3, %r4; @@ -1320,8 +1320,8 @@ define i32 @reduce_umin_i32(<8 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_umin_i32_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_param_0]; ; CHECK-NEXT: min.u32 %r9, %r3, %r7; ; CHECK-NEXT: min.u32 %r10, %r1, %r5; ; CHECK-NEXT: min.u32 %r11, %r4, %r8; @@ -1341,9 +1341,9 @@ define i32 @reduce_umin_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r7, [reduce_umin_i32_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_umin_i32_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b32 %r7, [reduce_umin_i32_nonpow2_param_0+24]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_umin_i32_nonpow2_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_umin_i32_nonpow2_param_0]; ; CHECK-NEXT: min.u32 %r8, %r3, %r7; ; CHECK-NEXT: min.u32 %r9, %r1, %r5; ; CHECK-NEXT: min.u32 %r10, %r9, %r8; @@ -1363,7 +1363,7 @@ define i16 @reduce_smax_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: .reg .b32 %r<6>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: 
-; CHECK-SM80-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0]; ; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; ; CHECK-SM80-NEXT: max.s16 %rs5, %rs3, %rs1; @@ -1385,7 +1385,7 @@ define i16 @reduce_smax_i16(<8 x i16> %in) { ; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i16_param_0]; ; CHECK-SM100-NEXT: max.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: max.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: max.s16x2 %r7, %r6, %r5; @@ -1408,10 +1408,10 @@ define i16 @reduce_smax_i16_nonpow2(<7 x i16> %in) { ; CHECK-SM80-NEXT: .reg .b32 %r<3>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.u32 %r1, [reduce_smax_i16_nonpow2_param_0+8]; +; CHECK-SM80-NEXT: ld.param.b32 %r1, [reduce_smax_i16_nonpow2_param_0+8]; ; CHECK-SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-SM80-NEXT: ld.param.u16 %rs7, [reduce_smax_i16_nonpow2_param_0+12]; -; CHECK-SM80-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0]; +; CHECK-SM80-NEXT: ld.param.b16 %rs7, [reduce_smax_i16_nonpow2_param_0+12]; +; CHECK-SM80-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0]; ; CHECK-SM80-NEXT: max.s16 %rs8, %rs3, %rs7; ; CHECK-SM80-NEXT: max.s16 %rs9, %rs1, %rs5; ; CHECK-SM80-NEXT: max.s16 %rs10, %rs9, %rs8; @@ -1428,12 +1428,12 @@ define i16 @reduce_smax_i16_nonpow2(<7 x i16> %in) { ; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.u32 %r1, [reduce_smax_i16_nonpow2_param_0+8]; +; CHECK-SM100-NEXT: ld.param.b32 %r1, [reduce_smax_i16_nonpow2_param_0+8]; ; CHECK-SM100-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-SM100-NEXT: 
ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0]; +; CHECK-SM100-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smax_i16_nonpow2_param_0]; ; CHECK-SM100-NEXT: mov.b32 %r2, {%rs1, %rs2}; ; CHECK-SM100-NEXT: mov.b32 %r3, {%rs3, %rs4}; -; CHECK-SM100-NEXT: ld.param.u16 %rs7, [reduce_smax_i16_nonpow2_param_0+12]; +; CHECK-SM100-NEXT: ld.param.b16 %rs7, [reduce_smax_i16_nonpow2_param_0+12]; ; CHECK-SM100-NEXT: mov.b16 %rs8, -32768; ; CHECK-SM100-NEXT: mov.b32 %r4, {%rs7, %rs8}; ; CHECK-SM100-NEXT: max.s16x2 %r5, %r3, %r4; @@ -1454,8 +1454,8 @@ define i32 @reduce_smax_i32(<8 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smax_i32_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_param_0]; ; CHECK-NEXT: max.s32 %r9, %r3, %r7; ; CHECK-NEXT: max.s32 %r10, %r1, %r5; ; CHECK-NEXT: max.s32 %r11, %r4, %r8; @@ -1475,9 +1475,9 @@ define i32 @reduce_smax_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r7, [reduce_smax_i32_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_smax_i32_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b32 %r7, [reduce_smax_i32_nonpow2_param_0+24]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_smax_i32_nonpow2_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smax_i32_nonpow2_param_0]; ; CHECK-NEXT: max.s32 %r8, %r3, %r7; ; CHECK-NEXT: max.s32 %r9, %r1, %r5; ; CHECK-NEXT: max.s32 %r10, %r9, %r8; @@ -1497,7 +1497,7 @@ define i16 @reduce_smin_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: .reg .b32 %r<6>; ; CHECK-SM80-EMPTY: ; 
CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0]; ; CHECK-SM80-NEXT: mov.b32 {%rs1, %rs2}, %r4; ; CHECK-SM80-NEXT: mov.b32 {%rs3, %rs4}, %r2; ; CHECK-SM80-NEXT: min.s16 %rs5, %rs3, %rs1; @@ -1519,7 +1519,7 @@ define i16 @reduce_smin_i16(<8 x i16> %in) { ; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i16_param_0]; ; CHECK-SM100-NEXT: min.s16x2 %r5, %r2, %r4; ; CHECK-SM100-NEXT: min.s16x2 %r6, %r1, %r3; ; CHECK-SM100-NEXT: min.s16x2 %r7, %r6, %r5; @@ -1542,10 +1542,10 @@ define i16 @reduce_smin_i16_nonpow2(<7 x i16> %in) { ; CHECK-SM80-NEXT: .reg .b32 %r<3>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.u32 %r1, [reduce_smin_i16_nonpow2_param_0+8]; +; CHECK-SM80-NEXT: ld.param.b32 %r1, [reduce_smin_i16_nonpow2_param_0+8]; ; CHECK-SM80-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-SM80-NEXT: ld.param.u16 %rs7, [reduce_smin_i16_nonpow2_param_0+12]; -; CHECK-SM80-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0]; +; CHECK-SM80-NEXT: ld.param.b16 %rs7, [reduce_smin_i16_nonpow2_param_0+12]; +; CHECK-SM80-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0]; ; CHECK-SM80-NEXT: min.s16 %rs8, %rs3, %rs7; ; CHECK-SM80-NEXT: min.s16 %rs9, %rs1, %rs5; ; CHECK-SM80-NEXT: min.s16 %rs10, %rs9, %rs8; @@ -1562,12 +1562,12 @@ define i16 @reduce_smin_i16_nonpow2(<7 x i16> %in) { ; CHECK-SM100-NEXT: .reg .b32 %r<9>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.u32 %r1, [reduce_smin_i16_nonpow2_param_0+8]; +; CHECK-SM100-NEXT: ld.param.b32 %r1, [reduce_smin_i16_nonpow2_param_0+8]; ; CHECK-SM100-NEXT: mov.b32 {%rs5, %rs6}, 
%r1; -; CHECK-SM100-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0]; +; CHECK-SM100-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_smin_i16_nonpow2_param_0]; ; CHECK-SM100-NEXT: mov.b32 %r2, {%rs1, %rs2}; ; CHECK-SM100-NEXT: mov.b32 %r3, {%rs3, %rs4}; -; CHECK-SM100-NEXT: ld.param.u16 %rs7, [reduce_smin_i16_nonpow2_param_0+12]; +; CHECK-SM100-NEXT: ld.param.b16 %rs7, [reduce_smin_i16_nonpow2_param_0+12]; ; CHECK-SM100-NEXT: mov.b16 %rs8, 32767; ; CHECK-SM100-NEXT: mov.b32 %r4, {%rs7, %rs8}; ; CHECK-SM100-NEXT: min.s16x2 %r5, %r3, %r4; @@ -1588,8 +1588,8 @@ define i32 @reduce_smin_i32(<8 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_smin_i32_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_param_0]; ; CHECK-NEXT: min.s32 %r9, %r3, %r7; ; CHECK-NEXT: min.s32 %r10, %r1, %r5; ; CHECK-NEXT: min.s32 %r11, %r4, %r8; @@ -1609,9 +1609,9 @@ define i32 @reduce_smin_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r7, [reduce_smin_i32_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_smin_i32_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b32 %r7, [reduce_smin_i32_nonpow2_param_0+24]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_smin_i32_nonpow2_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_smin_i32_nonpow2_param_0]; ; CHECK-NEXT: min.s32 %r8, %r3, %r7; ; CHECK-NEXT: min.s32 %r9, %r1, %r5; ; CHECK-NEXT: min.s32 %r10, %r9, %r8; @@ -1631,7 +1631,7 @@ define i16 @reduce_and_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: .reg .b32 
%r<11>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; ; CHECK-SM80-NEXT: and.b32 %r5, %r2, %r4; ; CHECK-SM80-NEXT: and.b32 %r6, %r1, %r3; ; CHECK-SM80-NEXT: and.b32 %r7, %r6, %r5; @@ -1650,7 +1650,7 @@ define i16 @reduce_and_i16(<8 x i16> %in) { ; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i16_param_0]; ; CHECK-SM100-NEXT: and.b32 %r5, %r2, %r4; ; CHECK-SM100-NEXT: and.b32 %r6, %r1, %r3; ; CHECK-SM100-NEXT: and.b32 %r7, %r6, %r5; @@ -1673,12 +1673,12 @@ define i16 @reduce_and_i16_nonpow2(<7 x i16> %in) { ; CHECK-NEXT: .reg .b32 %r<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [reduce_and_i16_nonpow2_param_0+8]; +; CHECK-NEXT: ld.param.b32 %r1, [reduce_and_i16_nonpow2_param_0+8]; ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_and_i16_nonpow2_param_0]; +; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_and_i16_nonpow2_param_0]; ; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2}; ; CHECK-NEXT: mov.b32 %r3, {%rs3, %rs4}; -; CHECK-NEXT: ld.param.u16 %rs7, [reduce_and_i16_nonpow2_param_0+12]; +; CHECK-NEXT: ld.param.b16 %rs7, [reduce_and_i16_nonpow2_param_0+12]; ; CHECK-NEXT: mov.b16 %rs8, -1; ; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8}; ; CHECK-NEXT: and.b32 %r5, %r3, %r4; @@ -1699,8 +1699,8 @@ define i32 @reduce_and_i32(<8 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_and_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, 
[reduce_and_i32_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_param_0]; ; CHECK-NEXT: and.b32 %r9, %r3, %r7; ; CHECK-NEXT: and.b32 %r10, %r1, %r5; ; CHECK-NEXT: and.b32 %r11, %r4, %r8; @@ -1720,9 +1720,9 @@ define i32 @reduce_and_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r7, [reduce_and_i32_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_and_i32_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b32 %r7, [reduce_and_i32_nonpow2_param_0+24]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_and_i32_nonpow2_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_and_i32_nonpow2_param_0]; ; CHECK-NEXT: and.b32 %r8, %r3, %r7; ; CHECK-NEXT: and.b32 %r9, %r1, %r5; ; CHECK-NEXT: and.b32 %r10, %r9, %r8; @@ -1742,7 +1742,7 @@ define i16 @reduce_or_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: .reg .b32 %r<11>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; ; CHECK-SM80-NEXT: or.b32 %r5, %r2, %r4; ; CHECK-SM80-NEXT: or.b32 %r6, %r1, %r3; ; CHECK-SM80-NEXT: or.b32 %r7, %r6, %r5; @@ -1761,7 +1761,7 @@ define i16 @reduce_or_i16(<8 x i16> %in) { ; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i16_param_0]; ; CHECK-SM100-NEXT: or.b32 %r5, %r2, %r4; ; CHECK-SM100-NEXT: or.b32 %r6, %r1, %r3; ; CHECK-SM100-NEXT: or.b32 %r7, %r6, %r5; @@ -1784,12 +1784,12 @@ define i16 @reduce_or_i16_nonpow2(<7 x i16> %in) { ; CHECK-NEXT: .reg .b32 %r<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: 
ld.param.u32 %r1, [reduce_or_i16_nonpow2_param_0+8]; +; CHECK-NEXT: ld.param.b32 %r1, [reduce_or_i16_nonpow2_param_0+8]; ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_or_i16_nonpow2_param_0]; +; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_or_i16_nonpow2_param_0]; ; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2}; ; CHECK-NEXT: mov.b32 %r3, {%rs3, %rs4}; -; CHECK-NEXT: ld.param.u16 %rs7, [reduce_or_i16_nonpow2_param_0+12]; +; CHECK-NEXT: ld.param.b16 %rs7, [reduce_or_i16_nonpow2_param_0+12]; ; CHECK-NEXT: mov.b16 %rs8, 0; ; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8}; ; CHECK-NEXT: or.b32 %r5, %r3, %r4; @@ -1810,8 +1810,8 @@ define i32 @reduce_or_i32(<8 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_or_i32_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_param_0]; ; CHECK-NEXT: or.b32 %r9, %r3, %r7; ; CHECK-NEXT: or.b32 %r10, %r1, %r5; ; CHECK-NEXT: or.b32 %r11, %r4, %r8; @@ -1831,9 +1831,9 @@ define i32 @reduce_or_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r7, [reduce_or_i32_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_or_i32_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b32 %r7, [reduce_or_i32_nonpow2_param_0+24]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_or_i32_nonpow2_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_or_i32_nonpow2_param_0]; ; CHECK-NEXT: or.b32 %r8, %r3, %r7; ; CHECK-NEXT: or.b32 %r9, %r1, %r5; ; CHECK-NEXT: or.b32 %r10, %r9, %r8; @@ -1853,7 +1853,7 @@ define i16 
@reduce_xor_i16(<8 x i16> %in) { ; CHECK-SM80-NEXT: .reg .b32 %r<11>; ; CHECK-SM80-EMPTY: ; CHECK-SM80-NEXT: // %bb.0: -; CHECK-SM80-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; +; CHECK-SM80-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; ; CHECK-SM80-NEXT: xor.b32 %r5, %r2, %r4; ; CHECK-SM80-NEXT: xor.b32 %r6, %r1, %r3; ; CHECK-SM80-NEXT: xor.b32 %r7, %r6, %r5; @@ -1872,7 +1872,7 @@ define i16 @reduce_xor_i16(<8 x i16> %in) { ; CHECK-SM100-NEXT: .reg .b32 %r<11>; ; CHECK-SM100-EMPTY: ; CHECK-SM100-NEXT: // %bb.0: -; CHECK-SM100-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; +; CHECK-SM100-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i16_param_0]; ; CHECK-SM100-NEXT: xor.b32 %r5, %r2, %r4; ; CHECK-SM100-NEXT: xor.b32 %r6, %r1, %r3; ; CHECK-SM100-NEXT: xor.b32 %r7, %r6, %r5; @@ -1895,12 +1895,12 @@ define i16 @reduce_xor_i16_nonpow2(<7 x i16> %in) { ; CHECK-NEXT: .reg .b32 %r<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [reduce_xor_i16_nonpow2_param_0+8]; +; CHECK-NEXT: ld.param.b32 %r1, [reduce_xor_i16_nonpow2_param_0+8]; ; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r1; -; CHECK-NEXT: ld.param.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [reduce_xor_i16_nonpow2_param_0]; +; CHECK-NEXT: ld.param.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [reduce_xor_i16_nonpow2_param_0]; ; CHECK-NEXT: mov.b32 %r2, {%rs1, %rs2}; ; CHECK-NEXT: mov.b32 %r3, {%rs3, %rs4}; -; CHECK-NEXT: ld.param.u16 %rs7, [reduce_xor_i16_nonpow2_param_0+12]; +; CHECK-NEXT: ld.param.b16 %rs7, [reduce_xor_i16_nonpow2_param_0+12]; ; CHECK-NEXT: mov.b16 %rs8, 0; ; CHECK-NEXT: mov.b32 %r4, {%rs7, %rs8}; ; CHECK-NEXT: xor.b32 %r5, %r3, %r4; @@ -1921,8 +1921,8 @@ define i32 @reduce_xor_i32(<8 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<16>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, 
[reduce_xor_i32_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [reduce_xor_i32_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_param_0]; ; CHECK-NEXT: xor.b32 %r9, %r3, %r7; ; CHECK-NEXT: xor.b32 %r10, %r1, %r5; ; CHECK-NEXT: xor.b32 %r11, %r4, %r8; @@ -1942,9 +1942,9 @@ define i32 @reduce_xor_i32_nonpow2(<7 x i32> %in) { ; CHECK-NEXT: .reg .b32 %r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r7, [reduce_xor_i32_nonpow2_param_0+24]; -; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [reduce_xor_i32_nonpow2_param_0+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_nonpow2_param_0]; +; CHECK-NEXT: ld.param.b32 %r7, [reduce_xor_i32_nonpow2_param_0+24]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [reduce_xor_i32_nonpow2_param_0+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [reduce_xor_i32_nonpow2_param_0]; ; CHECK-NEXT: xor.b32 %r8, %r3, %r7; ; CHECK-NEXT: xor.b32 %r9, %r1, %r5; ; CHECK-NEXT: xor.b32 %r10, %r9, %r8; diff --git a/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll b/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll index ed785298f590..dbc10757dc43 100644 --- a/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll +++ b/llvm/test/CodeGen/NVPTX/redux-sync-f32.ll @@ -10,10 +10,10 @@ define float @redux_sync_fmin(float %src, i32 %mask) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmin_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmin_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [redux_sync_fmin_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [redux_sync_fmin_param_1]; ; CHECK-NEXT: redux.sync.min.f32 %f2, %f1, %r1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %val = call float @llvm.nvvm.redux.sync.fmin(float %src, i32 %mask) ret float %val @@ -27,10 +27,10 @@ define float @redux_sync_fmin_abs(float %src, i32 %mask) { ; CHECK-NEXT: .reg 
.b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmin_abs_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmin_abs_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [redux_sync_fmin_abs_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [redux_sync_fmin_abs_param_1]; ; CHECK-NEXT: redux.sync.min.abs.f32 %f2, %f1, %r1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %val = call float @llvm.nvvm.redux.sync.fmin.abs(float %src, i32 %mask) ret float %val @@ -44,10 +44,10 @@ define float @redux_sync_fmin_NaN(float %src, i32 %mask) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmin_NaN_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmin_NaN_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [redux_sync_fmin_NaN_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [redux_sync_fmin_NaN_param_1]; ; CHECK-NEXT: redux.sync.min.NaN.f32 %f2, %f1, %r1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %val = call float @llvm.nvvm.redux.sync.fmin.NaN(float %src, i32 %mask) ret float %val @@ -61,10 +61,10 @@ define float @redux_sync_fmin_abs_NaN(float %src, i32 %mask) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmin_abs_NaN_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmin_abs_NaN_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [redux_sync_fmin_abs_NaN_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [redux_sync_fmin_abs_NaN_param_1]; ; CHECK-NEXT: redux.sync.min.abs.NaN.f32 %f2, %f1, %r1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %val = call float @llvm.nvvm.redux.sync.fmin.abs.NaN(float %src, i32 %mask) ret float %val @@ -78,10 +78,10 @@ define float @redux_sync_fmax(float %src, i32 %mask) { ; 
CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmax_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmax_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [redux_sync_fmax_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [redux_sync_fmax_param_1]; ; CHECK-NEXT: redux.sync.max.f32 %f2, %f1, %r1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %val = call float @llvm.nvvm.redux.sync.fmax(float %src, i32 %mask) ret float %val @@ -95,10 +95,10 @@ define float @redux_sync_fmax_abs(float %src, i32 %mask) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmax_abs_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmax_abs_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [redux_sync_fmax_abs_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [redux_sync_fmax_abs_param_1]; ; CHECK-NEXT: redux.sync.max.abs.f32 %f2, %f1, %r1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %val = call float @llvm.nvvm.redux.sync.fmax.abs(float %src, i32 %mask) ret float %val @@ -112,10 +112,10 @@ define float @redux_sync_fmax_NaN(float %src, i32 %mask) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmax_NaN_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmax_NaN_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [redux_sync_fmax_NaN_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [redux_sync_fmax_NaN_param_1]; ; CHECK-NEXT: redux.sync.max.NaN.f32 %f2, %f1, %r1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %val = call float @llvm.nvvm.redux.sync.fmax.NaN(float %src, i32 %mask) ret float %val @@ -129,10 +129,10 @@ define float @redux_sync_fmax_abs_NaN(float %src, i32 %mask) { ; CHECK-NEXT: .reg .b32 %f<3>; ; 
CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [redux_sync_fmax_abs_NaN_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [redux_sync_fmax_abs_NaN_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [redux_sync_fmax_abs_NaN_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [redux_sync_fmax_abs_NaN_param_1]; ; CHECK-NEXT: redux.sync.max.abs.NaN.f32 %f2, %f1, %r1; -; CHECK-NEXT: st.param.f32 [func_retval0], %f2; +; CHECK-NEXT: st.param.b32 [func_retval0], %f2; ; CHECK-NEXT: ret; %val = call float @llvm.nvvm.redux.sync.fmax.abs.NaN(float %src, i32 %mask) ret float %val diff --git a/llvm/test/CodeGen/NVPTX/reg-types.ll b/llvm/test/CodeGen/NVPTX/reg-types.ll index 7b4ebcae8a67..4b50bca7f8ef 100644 --- a/llvm/test/CodeGen/NVPTX/reg-types.ll +++ b/llvm/test/CodeGen/NVPTX/reg-types.ll @@ -32,28 +32,28 @@ entry: ; Verify that we use correct register types. store i8 1, ptr %s8, align 1 ; CHECK: mov.b16 [[R1:%rs[0-9]]], 1; -; CHECK-NEXT: st.u8 {{.*}}, [[R1]] +; CHECK-NEXT: st.b8 {{.*}}, [[R1]] store i8 2, ptr %u8, align 1 ; CHECK: mov.b16 [[R2:%rs[0-9]]], 2; -; CHECK-NEXT: st.u8 {{.*}}, [[R2]] +; CHECK-NEXT: st.b8 {{.*}}, [[R2]] store i16 3, ptr %s16, align 2 ; CHECK: mov.b16 [[R3:%rs[0-9]]], 3; -; CHECK-NEXT: st.u16 {{.*}}, [[R3]] +; CHECK-NEXT: st.b16 {{.*}}, [[R3]] store i16 4, ptr %u16, align 2 ; CHECK: mov.b16 [[R4:%rs[0-9]]], 4; -; CHECK-NEXT: st.u16 {{.*}}, [[R4]] +; CHECK-NEXT: st.b16 {{.*}}, [[R4]] store i32 5, ptr %s32, align 4 ; CHECK: mov.b32 [[R5:%r[0-9]]], 5; -; CHECK-NEXT: st.u32 {{.*}}, [[R5]] +; CHECK-NEXT: st.b32 {{.*}}, [[R5]] store i32 6, ptr %u32, align 4 ; CHECK: mov.b32 [[R6:%r[0-9]]], 6; -; CHECK-NEXT: st.u32 {{.*}}, [[R6]] +; CHECK-NEXT: st.b32 {{.*}}, [[R6]] store i64 7, ptr %s64, align 8 ; CHECK: mov.b64 [[R7:%rd[0-9]]], 7; -; CHECK-NEXT: st.u64 {{.*}}, [[R7]] +; CHECK-NEXT: st.b64 {{.*}}, [[R7]] store i64 8, ptr %u64, align 8 ; CHECK: mov.b64 [[R8:%rd[0-9]]], 8; -; CHECK-NEXT: st.u64 {{.*}}, [[R8]] +; CHECK-NEXT: st.b64 {{.*}}, [[R8]] ; FP 
constants are stored via integer registers, but that's an ; implementation detail that's irrelevant here. @@ -61,9 +61,9 @@ entry: store double 1.000000e+01, ptr %f64, align 8 ; Instead, we force a load into a register and then verify register type. %f32v = load volatile float, ptr %f32, align 4 -; CHECK: ld.volatile.f32 %f{{[0-9]+}} +; CHECK: ld.volatile.b32 %f{{[0-9]+}} %f64v = load volatile double, ptr %f64, align 8 -; CHECK: ld.volatile.f64 %fd{{[0-9]+}} +; CHECK: ld.volatile.b64 %fd{{[0-9]+}} ret void ; CHECK: ret; ; NO8BIT: ret; diff --git a/llvm/test/CodeGen/NVPTX/rotate-add.ll b/llvm/test/CodeGen/NVPTX/rotate-add.ll index 820e8000a565..aada7eadce2a 100644 --- a/llvm/test/CodeGen/NVPTX/rotate-add.ll +++ b/llvm/test/CodeGen/NVPTX/rotate-add.ll @@ -10,7 +10,7 @@ define i32 @test_simple_rotl(i32 %x) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_simple_rotl_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_simple_rotl_param_0]; ; CHECK-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 7; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -26,7 +26,7 @@ define i32 @test_simple_rotr(i32 %x) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_simple_rotr_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_simple_rotr_param_0]; ; CHECK-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 25; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -42,8 +42,8 @@ define i32 @test_rotl_var(i32 %x, i32 %y) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_rotl_var_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_rotl_var_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_rotl_var_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_rotl_var_param_1]; ; CHECK-NEXT: shf.l.wrap.b32 %r3, %r1, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -60,8 +60,8 @@ define i32 
@test_rotr_var(i32 %x, i32 %y) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_rotr_var_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_rotr_var_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_rotr_var_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_rotr_var_param_1]; ; CHECK-NEXT: shf.r.wrap.b32 %r3, %r1, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -78,8 +78,8 @@ define i32 @test_invalid_rotl_var_and(i32 %x, i32 %y) { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_invalid_rotl_var_and_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_invalid_rotl_var_and_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_invalid_rotl_var_and_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_invalid_rotl_var_and_param_1]; ; CHECK-NEXT: shl.b32 %r3, %r1, %r2; ; CHECK-NEXT: neg.s32 %r4, %r2; ; CHECK-NEXT: and.b32 %r5, %r4, 31; @@ -101,8 +101,8 @@ define i32 @test_invalid_rotr_var_and(i32 %x, i32 %y) { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_invalid_rotr_var_and_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_invalid_rotr_var_and_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_invalid_rotr_var_and_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_invalid_rotr_var_and_param_1]; ; CHECK-NEXT: shr.u32 %r3, %r1, %r2; ; CHECK-NEXT: neg.s32 %r4, %r2; ; CHECK-NEXT: and.b32 %r5, %r4, 31; @@ -124,9 +124,9 @@ define i32 @test_fshl_special_case(i32 %x0, i32 %x1, i32 %y) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_fshl_special_case_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_fshl_special_case_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_fshl_special_case_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fshl_special_case_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_fshl_special_case_param_1]; 
+; CHECK-NEXT: ld.param.b32 %r3, [test_fshl_special_case_param_2]; ; CHECK-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -144,9 +144,9 @@ define i32 @test_fshr_special_case(i32 %x0, i32 %x1, i32 %y) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_fshr_special_case_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_fshr_special_case_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_fshr_special_case_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fshr_special_case_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_fshr_special_case_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_fshr_special_case_param_2]; ; CHECK-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -165,7 +165,7 @@ define i64 @test_rotl_udiv_special_case(i64 %i) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_rotl_udiv_special_case_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_rotl_udiv_special_case_param_0]; ; CHECK-NEXT: mul.hi.u64 %rd2, %rd1, -6148914691236517205; ; CHECK-NEXT: shr.u64 %rd3, %rd2, 1; ; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd3; @@ -187,7 +187,7 @@ define i32 @test_rotl_mul_special_case(i32 %i) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_rotl_mul_special_case_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_rotl_mul_special_case_param_0]; ; CHECK-NEXT: mul.lo.s32 %r2, %r1, 9; ; CHECK-NEXT: shf.l.wrap.b32 %r3, %r2, %r2, 7; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; @@ -206,7 +206,7 @@ define i64 @test_rotl_mul_with_mask_special_case(i64 %i) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_rotl_mul_with_mask_special_case_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, 
[test_rotl_mul_with_mask_special_case_param_0]; ; CHECK-NEXT: mul.lo.s64 %rd2, %rd1, 9; ; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: mov.b64 {%r3, %r4}, %rd2; @@ -230,7 +230,7 @@ define i32 @test_fshl_with_mask_special_case(i32 %x) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_fshl_with_mask_special_case_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_fshl_with_mask_special_case_param_0]; ; CHECK-NEXT: or.b32 %r2, %r1, 1; ; CHECK-NEXT: shf.l.wrap.b32 %r3, %r1, %r2, 5; ; CHECK-NEXT: and.b32 %r4, %r3, -31; diff --git a/llvm/test/CodeGen/NVPTX/rotate.ll b/llvm/test/CodeGen/NVPTX/rotate.ll index f77fb4115567..2d7fa40e5be7 100644 --- a/llvm/test/CodeGen/NVPTX/rotate.ll +++ b/llvm/test/CodeGen/NVPTX/rotate.ll @@ -21,8 +21,8 @@ define i32 @rotate32(i32 %a, i32 %b) { ; SM20-NEXT: .reg .b32 %r<9>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u32 %r1, [rotate32_param_0]; -; SM20-NEXT: ld.param.u32 %r2, [rotate32_param_1]; +; SM20-NEXT: ld.param.b32 %r1, [rotate32_param_0]; +; SM20-NEXT: ld.param.b32 %r2, [rotate32_param_1]; ; SM20-NEXT: and.b32 %r3, %r2, 31; ; SM20-NEXT: shl.b32 %r4, %r1, %r3; ; SM20-NEXT: neg.s32 %r5, %r2; @@ -37,8 +37,8 @@ define i32 @rotate32(i32 %a, i32 %b) { ; SM35-NEXT: .reg .b32 %r<4>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u32 %r1, [rotate32_param_0]; -; SM35-NEXT: ld.param.u32 %r2, [rotate32_param_1]; +; SM35-NEXT: ld.param.b32 %r1, [rotate32_param_0]; +; SM35-NEXT: ld.param.b32 %r2, [rotate32_param_1]; ; SM35-NEXT: shf.l.wrap.b32 %r3, %r1, %r1, %r2; ; SM35-NEXT: st.param.b32 [func_retval0], %r3; ; SM35-NEXT: ret; @@ -53,8 +53,8 @@ define i64 @rotate64(i64 %a, i32 %b) { ; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [rotate64_param_0]; -; SM20-NEXT: ld.param.u32 %r1, [rotate64_param_1]; +; SM20-NEXT: ld.param.b64 %rd1, [rotate64_param_0]; +; SM20-NEXT: ld.param.b32 %r1, 
[rotate64_param_1]; ; SM20-NEXT: and.b32 %r2, %r1, 63; ; SM20-NEXT: shl.b64 %rd2, %rd1, %r2; ; SM20-NEXT: neg.s32 %r3, %r1; @@ -70,8 +70,8 @@ define i64 @rotate64(i64 %a, i32 %b) { ; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [rotate64_param_0]; -; SM35-NEXT: ld.param.u32 %r1, [rotate64_param_1]; +; SM35-NEXT: ld.param.b64 %rd1, [rotate64_param_0]; +; SM35-NEXT: ld.param.b32 %r1, [rotate64_param_1]; ; SM35-NEXT: and.b32 %r2, %r1, 63; ; SM35-NEXT: shl.b64 %rd2, %rd1, %r2; ; SM35-NEXT: neg.s32 %r3, %r1; @@ -91,8 +91,8 @@ define i64 @rotateright64(i64 %a, i32 %b) { ; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [rotateright64_param_0]; -; SM20-NEXT: ld.param.u32 %r1, [rotateright64_param_1]; +; SM20-NEXT: ld.param.b64 %rd1, [rotateright64_param_0]; +; SM20-NEXT: ld.param.b32 %r1, [rotateright64_param_1]; ; SM20-NEXT: and.b32 %r2, %r1, 63; ; SM20-NEXT: shr.u64 %rd2, %rd1, %r2; ; SM20-NEXT: neg.s32 %r3, %r1; @@ -108,8 +108,8 @@ define i64 @rotateright64(i64 %a, i32 %b) { ; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [rotateright64_param_0]; -; SM35-NEXT: ld.param.u32 %r1, [rotateright64_param_1]; +; SM35-NEXT: ld.param.b64 %rd1, [rotateright64_param_0]; +; SM35-NEXT: ld.param.b32 %r1, [rotateright64_param_1]; ; SM35-NEXT: and.b32 %r2, %r1, 63; ; SM35-NEXT: shr.u64 %rd2, %rd1, %r2; ; SM35-NEXT: neg.s32 %r3, %r1; @@ -128,7 +128,7 @@ define i32 @rotl0(i32 %x) { ; SM20-NEXT: .reg .b32 %r<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u32 %r1, [rotl0_param_0]; +; SM20-NEXT: ld.param.b32 %r1, [rotl0_param_0]; ; SM20-NEXT: shr.u32 %r2, %r1, 24; ; SM20-NEXT: shl.b32 %r3, %r1, 8; ; SM20-NEXT: or.b32 %r4, %r3, %r2; @@ -140,7 +140,7 @@ define i32 @rotl0(i32 %x) { ; SM35-NEXT: .reg .b32 %r<3>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u32 %r1, [rotl0_param_0]; +; SM35-NEXT: 
ld.param.b32 %r1, [rotl0_param_0]; ; SM35-NEXT: shf.l.wrap.b32 %r2, %r1, %r1, 8; ; SM35-NEXT: st.param.b32 [func_retval0], %r2; ; SM35-NEXT: ret; @@ -157,8 +157,8 @@ define i64 @rotl64(i64 %a, i64 %n) { ; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [rotl64_param_0]; -; SM20-NEXT: ld.param.u32 %r1, [rotl64_param_1]; +; SM20-NEXT: ld.param.b64 %rd1, [rotl64_param_0]; +; SM20-NEXT: ld.param.b32 %r1, [rotl64_param_1]; ; SM20-NEXT: and.b32 %r2, %r1, 63; ; SM20-NEXT: shl.b64 %rd2, %rd1, %r2; ; SM20-NEXT: neg.s32 %r3, %r1; @@ -174,8 +174,8 @@ define i64 @rotl64(i64 %a, i64 %n) { ; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [rotl64_param_0]; -; SM35-NEXT: ld.param.u32 %r1, [rotl64_param_1]; +; SM35-NEXT: ld.param.b64 %rd1, [rotl64_param_0]; +; SM35-NEXT: ld.param.b32 %r1, [rotl64_param_1]; ; SM35-NEXT: and.b32 %r2, %r1, 63; ; SM35-NEXT: shl.b64 %rd2, %rd1, %r2; ; SM35-NEXT: neg.s32 %r3, %r1; @@ -194,7 +194,7 @@ define i64 @rotl64_low_imm(i64 %a) { ; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [rotl64_low_imm_param_0]; +; SM20-NEXT: ld.param.b64 %rd1, [rotl64_low_imm_param_0]; ; SM20-NEXT: shr.u64 %rd2, %rd1, 62; ; SM20-NEXT: shl.b64 %rd3, %rd1, 2; ; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; @@ -207,7 +207,7 @@ define i64 @rotl64_low_imm(i64 %a) { ; SM35-NEXT: .reg .b64 %rd<3>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [rotl64_low_imm_param_0]; +; SM35-NEXT: ld.param.b64 %rd1, [rotl64_low_imm_param_0]; ; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; ; SM35-NEXT: shf.l.wrap.b32 %r3, %r1, %r2, 2; ; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, 2; @@ -224,7 +224,7 @@ define i64 @rotl64_high_imm(i64 %a) { ; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [rotl64_high_imm_param_0]; +; SM20-NEXT: ld.param.b64 %rd1, [rotl64_high_imm_param_0]; ; SM20-NEXT: 
shr.u64 %rd2, %rd1, 1; ; SM20-NEXT: shl.b64 %rd3, %rd1, 63; ; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; @@ -237,7 +237,7 @@ define i64 @rotl64_high_imm(i64 %a) { ; SM35-NEXT: .reg .b64 %rd<3>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [rotl64_high_imm_param_0]; +; SM35-NEXT: ld.param.b64 %rd1, [rotl64_high_imm_param_0]; ; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; ; SM35-NEXT: shf.l.wrap.b32 %r3, %r2, %r1, 31; ; SM35-NEXT: shf.l.wrap.b32 %r4, %r1, %r2, 31; @@ -254,7 +254,7 @@ define i64 @rotl64_32_imm(i64 %a) { ; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [rotl64_32_imm_param_0]; +; SM20-NEXT: ld.param.b64 %rd1, [rotl64_32_imm_param_0]; ; SM20-NEXT: shr.u64 %rd2, %rd1, 32; ; SM20-NEXT: shl.b64 %rd3, %rd1, 32; ; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; @@ -267,7 +267,7 @@ define i64 @rotl64_32_imm(i64 %a) { ; SM35-NEXT: .reg .b64 %rd<3>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [rotl64_32_imm_param_0]; +; SM35-NEXT: ld.param.b64 %rd1, [rotl64_32_imm_param_0]; ; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; ; SM35-NEXT: mov.b64 %rd2, {%r2, %r1}; ; SM35-NEXT: st.param.b64 [func_retval0], %rd2; @@ -283,8 +283,8 @@ define i64 @rotr64(i64 %a, i64 %n) { ; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [rotr64_param_0]; -; SM20-NEXT: ld.param.u32 %r1, [rotr64_param_1]; +; SM20-NEXT: ld.param.b64 %rd1, [rotr64_param_0]; +; SM20-NEXT: ld.param.b32 %r1, [rotr64_param_1]; ; SM20-NEXT: and.b32 %r2, %r1, 63; ; SM20-NEXT: shr.u64 %rd2, %rd1, %r2; ; SM20-NEXT: neg.s32 %r3, %r1; @@ -300,8 +300,8 @@ define i64 @rotr64(i64 %a, i64 %n) { ; SM35-NEXT: .reg .b64 %rd<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [rotr64_param_0]; -; SM35-NEXT: ld.param.u32 %r1, [rotr64_param_1]; +; SM35-NEXT: ld.param.b64 %rd1, [rotr64_param_0]; +; SM35-NEXT: ld.param.b32 %r1, [rotr64_param_1]; ; SM35-NEXT: and.b32 %r2, %r1, 63; ; 
SM35-NEXT: shr.u64 %rd2, %rd1, %r2; ; SM35-NEXT: neg.s32 %r3, %r1; @@ -320,7 +320,7 @@ define i64 @rotr64_low_imm(i64 %a) { ; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [rotr64_low_imm_param_0]; +; SM20-NEXT: ld.param.b64 %rd1, [rotr64_low_imm_param_0]; ; SM20-NEXT: shl.b64 %rd2, %rd1, 52; ; SM20-NEXT: shr.u64 %rd3, %rd1, 12; ; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; @@ -333,7 +333,7 @@ define i64 @rotr64_low_imm(i64 %a) { ; SM35-NEXT: .reg .b64 %rd<3>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [rotr64_low_imm_param_0]; +; SM35-NEXT: ld.param.b64 %rd1, [rotr64_low_imm_param_0]; ; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; ; SM35-NEXT: shf.r.wrap.b32 %r3, %r2, %r1, 12; ; SM35-NEXT: shf.r.wrap.b32 %r4, %r1, %r2, 12; @@ -350,7 +350,7 @@ define i64 @rotr64_high_imm(i64 %a) { ; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [rotr64_high_imm_param_0]; +; SM20-NEXT: ld.param.b64 %rd1, [rotr64_high_imm_param_0]; ; SM20-NEXT: shl.b64 %rd2, %rd1, 21; ; SM20-NEXT: shr.u64 %rd3, %rd1, 43; ; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; @@ -363,7 +363,7 @@ define i64 @rotr64_high_imm(i64 %a) { ; SM35-NEXT: .reg .b64 %rd<3>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [rotr64_high_imm_param_0]; +; SM35-NEXT: ld.param.b64 %rd1, [rotr64_high_imm_param_0]; ; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; ; SM35-NEXT: shf.r.wrap.b32 %r3, %r1, %r2, 11; ; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, 11; @@ -380,7 +380,7 @@ define i64 @rotr64_32_imm(i64 %a) { ; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [rotr64_32_imm_param_0]; +; SM20-NEXT: ld.param.b64 %rd1, [rotr64_32_imm_param_0]; ; SM20-NEXT: shl.b64 %rd2, %rd1, 32; ; SM20-NEXT: shr.u64 %rd3, %rd1, 32; ; SM20-NEXT: or.b64 %rd4, %rd3, %rd2; @@ -393,7 +393,7 @@ define i64 @rotr64_32_imm(i64 %a) { ; SM35-NEXT: .reg .b64 %rd<3>; ; SM35-EMPTY: ; 
SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [rotr64_32_imm_param_0]; +; SM35-NEXT: ld.param.b64 %rd1, [rotr64_32_imm_param_0]; ; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; ; SM35-NEXT: mov.b64 %rd2, {%r2, %r1}; ; SM35-NEXT: st.param.b64 [func_retval0], %rd2; @@ -408,10 +408,10 @@ define i32 @funnel_shift_right_32(i32 %a, i32 %b, i32 %c) { ; SM20-NEXT: .reg .b32 %r<11>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_right_32_param_0]; -; SM20-NEXT: ld.param.u32 %r2, [funnel_shift_right_32_param_2]; +; SM20-NEXT: ld.param.b32 %r1, [funnel_shift_right_32_param_0]; +; SM20-NEXT: ld.param.b32 %r2, [funnel_shift_right_32_param_2]; ; SM20-NEXT: and.b32 %r3, %r2, 31; -; SM20-NEXT: ld.param.u32 %r4, [funnel_shift_right_32_param_1]; +; SM20-NEXT: ld.param.b32 %r4, [funnel_shift_right_32_param_1]; ; SM20-NEXT: shr.u32 %r5, %r4, %r3; ; SM20-NEXT: shl.b32 %r6, %r1, 1; ; SM20-NEXT: not.b32 %r7, %r2; @@ -426,9 +426,9 @@ define i32 @funnel_shift_right_32(i32 %a, i32 %b, i32 %c) { ; SM35-NEXT: .reg .b32 %r<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_right_32_param_0]; -; SM35-NEXT: ld.param.u32 %r2, [funnel_shift_right_32_param_1]; -; SM35-NEXT: ld.param.u32 %r3, [funnel_shift_right_32_param_2]; +; SM35-NEXT: ld.param.b32 %r1, [funnel_shift_right_32_param_0]; +; SM35-NEXT: ld.param.b32 %r2, [funnel_shift_right_32_param_1]; +; SM35-NEXT: ld.param.b32 %r3, [funnel_shift_right_32_param_2]; ; SM35-NEXT: shf.r.wrap.b32 %r4, %r2, %r1, %r3; ; SM35-NEXT: st.param.b32 [func_retval0], %r4; ; SM35-NEXT: ret; @@ -442,11 +442,11 @@ define i32 @funnel_shift_left_32(i32 %a, i32 %b, i32 %c) { ; SM20-NEXT: .reg .b32 %r<11>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_left_32_param_0]; -; SM20-NEXT: ld.param.u32 %r2, [funnel_shift_left_32_param_2]; +; SM20-NEXT: ld.param.b32 %r1, [funnel_shift_left_32_param_0]; +; SM20-NEXT: ld.param.b32 %r2, [funnel_shift_left_32_param_2]; ; 
SM20-NEXT: and.b32 %r3, %r2, 31; ; SM20-NEXT: shl.b32 %r4, %r1, %r3; -; SM20-NEXT: ld.param.u32 %r5, [funnel_shift_left_32_param_1]; +; SM20-NEXT: ld.param.b32 %r5, [funnel_shift_left_32_param_1]; ; SM20-NEXT: shr.u32 %r6, %r5, 1; ; SM20-NEXT: not.b32 %r7, %r2; ; SM20-NEXT: and.b32 %r8, %r7, 31; @@ -460,9 +460,9 @@ define i32 @funnel_shift_left_32(i32 %a, i32 %b, i32 %c) { ; SM35-NEXT: .reg .b32 %r<5>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_left_32_param_0]; -; SM35-NEXT: ld.param.u32 %r2, [funnel_shift_left_32_param_1]; -; SM35-NEXT: ld.param.u32 %r3, [funnel_shift_left_32_param_2]; +; SM35-NEXT: ld.param.b32 %r1, [funnel_shift_left_32_param_0]; +; SM35-NEXT: ld.param.b32 %r2, [funnel_shift_left_32_param_1]; +; SM35-NEXT: ld.param.b32 %r3, [funnel_shift_left_32_param_2]; ; SM35-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, %r3; ; SM35-NEXT: st.param.b32 [func_retval0], %r4; ; SM35-NEXT: ret; @@ -477,10 +477,10 @@ define i64 @funnel_shift_right_64(i64 %a, i64 %b, i64 %c) { ; SM20-NEXT: .reg .b64 %rd<7>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [funnel_shift_right_64_param_0]; -; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_right_64_param_2]; +; SM20-NEXT: ld.param.b64 %rd1, [funnel_shift_right_64_param_0]; +; SM20-NEXT: ld.param.b32 %r1, [funnel_shift_right_64_param_2]; ; SM20-NEXT: and.b32 %r2, %r1, 63; -; SM20-NEXT: ld.param.u64 %rd2, [funnel_shift_right_64_param_1]; +; SM20-NEXT: ld.param.b64 %rd2, [funnel_shift_right_64_param_1]; ; SM20-NEXT: shr.u64 %rd3, %rd2, %r2; ; SM20-NEXT: shl.b64 %rd4, %rd1, 1; ; SM20-NEXT: not.b32 %r3, %r1; @@ -496,10 +496,10 @@ define i64 @funnel_shift_right_64(i64 %a, i64 %b, i64 %c) { ; SM35-NEXT: .reg .b64 %rd<7>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [funnel_shift_right_64_param_0]; -; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_right_64_param_2]; +; SM35-NEXT: ld.param.b64 %rd1, [funnel_shift_right_64_param_0]; +; SM35-NEXT: ld.param.b32 
%r1, [funnel_shift_right_64_param_2]; ; SM35-NEXT: and.b32 %r2, %r1, 63; -; SM35-NEXT: ld.param.u64 %rd2, [funnel_shift_right_64_param_1]; +; SM35-NEXT: ld.param.b64 %rd2, [funnel_shift_right_64_param_1]; ; SM35-NEXT: shr.u64 %rd3, %rd2, %r2; ; SM35-NEXT: shl.b64 %rd4, %rd1, 1; ; SM35-NEXT: not.b32 %r3, %r1; @@ -519,11 +519,11 @@ define i64 @funnel_shift_left_64(i64 %a, i64 %b, i64 %c) { ; SM20-NEXT: .reg .b64 %rd<7>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [funnel_shift_left_64_param_0]; -; SM20-NEXT: ld.param.u32 %r1, [funnel_shift_left_64_param_2]; +; SM20-NEXT: ld.param.b64 %rd1, [funnel_shift_left_64_param_0]; +; SM20-NEXT: ld.param.b32 %r1, [funnel_shift_left_64_param_2]; ; SM20-NEXT: and.b32 %r2, %r1, 63; ; SM20-NEXT: shl.b64 %rd2, %rd1, %r2; -; SM20-NEXT: ld.param.u64 %rd3, [funnel_shift_left_64_param_1]; +; SM20-NEXT: ld.param.b64 %rd3, [funnel_shift_left_64_param_1]; ; SM20-NEXT: shr.u64 %rd4, %rd3, 1; ; SM20-NEXT: not.b32 %r3, %r1; ; SM20-NEXT: and.b32 %r4, %r3, 63; @@ -538,11 +538,11 @@ define i64 @funnel_shift_left_64(i64 %a, i64 %b, i64 %c) { ; SM35-NEXT: .reg .b64 %rd<7>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [funnel_shift_left_64_param_0]; -; SM35-NEXT: ld.param.u32 %r1, [funnel_shift_left_64_param_2]; +; SM35-NEXT: ld.param.b64 %rd1, [funnel_shift_left_64_param_0]; +; SM35-NEXT: ld.param.b32 %r1, [funnel_shift_left_64_param_2]; ; SM35-NEXT: and.b32 %r2, %r1, 63; ; SM35-NEXT: shl.b64 %rd2, %rd1, %r2; -; SM35-NEXT: ld.param.u64 %rd3, [funnel_shift_left_64_param_1]; +; SM35-NEXT: ld.param.b64 %rd3, [funnel_shift_left_64_param_1]; ; SM35-NEXT: shr.u64 %rd4, %rd3, 1; ; SM35-NEXT: not.b32 %r3, %r1; ; SM35-NEXT: and.b32 %r4, %r3, 63; @@ -560,8 +560,8 @@ define i64 @fshl64_low_imm(i64 %a, i64 %b) { ; SM20-NEXT: .reg .b64 %rd<6>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [fshl64_low_imm_param_0]; -; SM20-NEXT: ld.param.u64 %rd2, [fshl64_low_imm_param_1]; +; SM20-NEXT: 
ld.param.b64 %rd1, [fshl64_low_imm_param_0]; +; SM20-NEXT: ld.param.b64 %rd2, [fshl64_low_imm_param_1]; ; SM20-NEXT: shr.u64 %rd3, %rd2, 59; ; SM20-NEXT: shl.b64 %rd4, %rd1, 5; ; SM20-NEXT: or.b64 %rd5, %rd4, %rd3; @@ -574,9 +574,9 @@ define i64 @fshl64_low_imm(i64 %a, i64 %b) { ; SM35-NEXT: .reg .b64 %rd<4>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [fshl64_low_imm_param_0]; +; SM35-NEXT: ld.param.b64 %rd1, [fshl64_low_imm_param_0]; ; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; -; SM35-NEXT: ld.param.u64 %rd2, [fshl64_low_imm_param_1]; +; SM35-NEXT: ld.param.b64 %rd2, [fshl64_low_imm_param_1]; ; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2; ; SM35-NEXT: shf.l.wrap.b32 %r5, %r4, %r1, 5; ; SM35-NEXT: shf.l.wrap.b32 %r6, %r1, %r2, 5; @@ -593,8 +593,8 @@ define i64 @fshl64_high_imm(i64 %a, i64 %b) { ; SM20-NEXT: .reg .b64 %rd<6>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [fshl64_high_imm_param_0]; -; SM20-NEXT: ld.param.u64 %rd2, [fshl64_high_imm_param_1]; +; SM20-NEXT: ld.param.b64 %rd1, [fshl64_high_imm_param_0]; +; SM20-NEXT: ld.param.b64 %rd2, [fshl64_high_imm_param_1]; ; SM20-NEXT: shr.u64 %rd3, %rd2, 9; ; SM20-NEXT: shl.b64 %rd4, %rd1, 55; ; SM20-NEXT: or.b64 %rd5, %rd4, %rd3; @@ -607,9 +607,9 @@ define i64 @fshl64_high_imm(i64 %a, i64 %b) { ; SM35-NEXT: .reg .b64 %rd<4>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [fshl64_high_imm_param_0]; +; SM35-NEXT: ld.param.b64 %rd1, [fshl64_high_imm_param_0]; ; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; -; SM35-NEXT: ld.param.u64 %rd2, [fshl64_high_imm_param_1]; +; SM35-NEXT: ld.param.b64 %rd2, [fshl64_high_imm_param_1]; ; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2; ; SM35-NEXT: shf.l.wrap.b32 %r5, %r4, %r1, 23; ; SM35-NEXT: shf.l.wrap.b32 %r6, %r3, %r4, 23; @@ -626,9 +626,9 @@ define i64 @fshl64_32_imm(i64 %a, i64 %b) { ; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [fshl64_32_imm_param_0]; +; SM20-NEXT: 
ld.param.b64 %rd1, [fshl64_32_imm_param_0]; ; SM20-NEXT: shl.b64 %rd2, %rd1, 32; -; SM20-NEXT: ld.param.u32 %rd3, [fshl64_32_imm_param_1+4]; +; SM20-NEXT: ld.param.b32 %rd3, [fshl64_32_imm_param_1+4]; ; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; ; SM20-NEXT: st.param.b64 [func_retval0], %rd4; ; SM20-NEXT: ret; @@ -639,9 +639,9 @@ define i64 @fshl64_32_imm(i64 %a, i64 %b) { ; SM35-NEXT: .reg .b64 %rd<4>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [fshl64_32_imm_param_0]; +; SM35-NEXT: ld.param.b64 %rd1, [fshl64_32_imm_param_0]; ; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; -; SM35-NEXT: ld.param.u64 %rd2, [fshl64_32_imm_param_1]; +; SM35-NEXT: ld.param.b64 %rd2, [fshl64_32_imm_param_1]; ; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2; ; SM35-NEXT: mov.b64 %rd3, {%r4, %r1}; ; SM35-NEXT: st.param.b64 [func_retval0], %rd3; @@ -656,8 +656,8 @@ define i64 @fshr64_low_imm(i64 %a, i64 %b) { ; SM20-NEXT: .reg .b64 %rd<6>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [fshr64_low_imm_param_0]; -; SM20-NEXT: ld.param.u64 %rd2, [fshr64_low_imm_param_1]; +; SM20-NEXT: ld.param.b64 %rd1, [fshr64_low_imm_param_0]; +; SM20-NEXT: ld.param.b64 %rd2, [fshr64_low_imm_param_1]; ; SM20-NEXT: shr.u64 %rd3, %rd2, 31; ; SM20-NEXT: shl.b64 %rd4, %rd1, 33; ; SM20-NEXT: or.b64 %rd5, %rd4, %rd3; @@ -670,9 +670,9 @@ define i64 @fshr64_low_imm(i64 %a, i64 %b) { ; SM35-NEXT: .reg .b64 %rd<4>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [fshr64_low_imm_param_0]; +; SM35-NEXT: ld.param.b64 %rd1, [fshr64_low_imm_param_0]; ; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; -; SM35-NEXT: ld.param.u64 %rd2, [fshr64_low_imm_param_1]; +; SM35-NEXT: ld.param.b64 %rd2, [fshr64_low_imm_param_1]; ; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2; ; SM35-NEXT: shf.r.wrap.b32 %r5, %r4, %r1, 31; ; SM35-NEXT: shf.r.wrap.b32 %r6, %r3, %r4, 31; @@ -689,8 +689,8 @@ define i64 @fshr64_high_imm(i64 %a, i64 %b) { ; SM20-NEXT: .reg .b64 %rd<6>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; 
SM20-NEXT: ld.param.u64 %rd1, [fshr64_high_imm_param_0]; -; SM20-NEXT: ld.param.u64 %rd2, [fshr64_high_imm_param_1]; +; SM20-NEXT: ld.param.b64 %rd1, [fshr64_high_imm_param_0]; +; SM20-NEXT: ld.param.b64 %rd2, [fshr64_high_imm_param_1]; ; SM20-NEXT: shr.u64 %rd3, %rd2, 33; ; SM20-NEXT: shl.b64 %rd4, %rd1, 31; ; SM20-NEXT: or.b64 %rd5, %rd4, %rd3; @@ -703,9 +703,9 @@ define i64 @fshr64_high_imm(i64 %a, i64 %b) { ; SM35-NEXT: .reg .b64 %rd<4>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [fshr64_high_imm_param_0]; +; SM35-NEXT: ld.param.b64 %rd1, [fshr64_high_imm_param_0]; ; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; -; SM35-NEXT: ld.param.u64 %rd2, [fshr64_high_imm_param_1]; +; SM35-NEXT: ld.param.b64 %rd2, [fshr64_high_imm_param_1]; ; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2; ; SM35-NEXT: shf.r.wrap.b32 %r5, %r4, %r1, 1; ; SM35-NEXT: shf.r.wrap.b32 %r6, %r1, %r2, 1; @@ -722,9 +722,9 @@ define i64 @fshr64_32_imm(i64 %a, i64 %b) { ; SM20-NEXT: .reg .b64 %rd<5>; ; SM20-EMPTY: ; SM20-NEXT: // %bb.0: -; SM20-NEXT: ld.param.u64 %rd1, [fshr64_32_imm_param_0]; +; SM20-NEXT: ld.param.b64 %rd1, [fshr64_32_imm_param_0]; ; SM20-NEXT: shl.b64 %rd2, %rd1, 32; -; SM20-NEXT: ld.param.u32 %rd3, [fshr64_32_imm_param_1+4]; +; SM20-NEXT: ld.param.b32 %rd3, [fshr64_32_imm_param_1+4]; ; SM20-NEXT: or.b64 %rd4, %rd2, %rd3; ; SM20-NEXT: st.param.b64 [func_retval0], %rd4; ; SM20-NEXT: ret; @@ -735,9 +735,9 @@ define i64 @fshr64_32_imm(i64 %a, i64 %b) { ; SM35-NEXT: .reg .b64 %rd<4>; ; SM35-EMPTY: ; SM35-NEXT: // %bb.0: -; SM35-NEXT: ld.param.u64 %rd1, [fshr64_32_imm_param_0]; +; SM35-NEXT: ld.param.b64 %rd1, [fshr64_32_imm_param_0]; ; SM35-NEXT: mov.b64 {%r1, %r2}, %rd1; -; SM35-NEXT: ld.param.u64 %rd2, [fshr64_32_imm_param_1]; +; SM35-NEXT: ld.param.b64 %rd2, [fshr64_32_imm_param_1]; ; SM35-NEXT: mov.b64 {%r3, %r4}, %rd2; ; SM35-NEXT: mov.b64 %rd3, {%r4, %r1}; ; SM35-NEXT: st.param.b64 [func_retval0], %rd3; diff --git a/llvm/test/CodeGen/NVPTX/rotate_64.ll 
b/llvm/test/CodeGen/NVPTX/rotate_64.ll index 841dc67c6864..c91211a13fdf 100644 --- a/llvm/test/CodeGen/NVPTX/rotate_64.ll +++ b/llvm/test/CodeGen/NVPTX/rotate_64.ll @@ -12,7 +12,7 @@ define i64 @rotate64(i64 %a, i32 %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [rotate64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [rotate64_param_0]; ; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: shf.l.wrap.b32 %r3, %r1, %r2, 3; ; CHECK-NEXT: shf.l.wrap.b32 %r4, %r2, %r1, 3; @@ -30,7 +30,7 @@ define i64 @rotateright64(i64 %a, i32 %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [rotateright64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [rotateright64_param_0]; ; CHECK-NEXT: mov.b64 {%r1, %r2}, %rd1; ; CHECK-NEXT: shf.r.wrap.b32 %r3, %r2, %r1, 3; ; CHECK-NEXT: shf.r.wrap.b32 %r4, %r1, %r2, 3; diff --git a/llvm/test/CodeGen/NVPTX/sad-intrins.ll b/llvm/test/CodeGen/NVPTX/sad-intrins.ll index 8258dca605e9..bd80784f62f4 100644 --- a/llvm/test/CodeGen/NVPTX/sad-intrins.ll +++ b/llvm/test/CodeGen/NVPTX/sad-intrins.ll @@ -9,9 +9,9 @@ define i16 @test_sad_i16(i16 %x, i16 %y, i16 %z) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [test_sad_i16_param_0]; -; CHECK-NEXT: ld.param.u16 %rs2, [test_sad_i16_param_1]; -; CHECK-NEXT: ld.param.u16 %rs3, [test_sad_i16_param_2]; +; CHECK-NEXT: ld.param.b16 %rs1, [test_sad_i16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [test_sad_i16_param_1]; +; CHECK-NEXT: ld.param.b16 %rs3, [test_sad_i16_param_2]; ; CHECK-NEXT: sad.s16 %rs4, %rs1, %rs2, %rs3; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; @@ -27,9 +27,9 @@ define i16 @test_sad_u16(i16 %x, i16 %y, i16 %z) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [test_sad_u16_param_0]; -; CHECK-NEXT: ld.param.u16 %rs2, 
[test_sad_u16_param_1]; -; CHECK-NEXT: ld.param.u16 %rs3, [test_sad_u16_param_2]; +; CHECK-NEXT: ld.param.b16 %rs1, [test_sad_u16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [test_sad_u16_param_1]; +; CHECK-NEXT: ld.param.b16 %rs3, [test_sad_u16_param_2]; ; CHECK-NEXT: sad.u16 %rs4, %rs1, %rs2, %rs3; ; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; @@ -44,9 +44,9 @@ define i32 @test_sad_i32(i32 %x, i32 %y, i32 %z) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_sad_i32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_sad_i32_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_sad_i32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_sad_i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_sad_i32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_sad_i32_param_2]; ; CHECK-NEXT: sad.s32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -60,9 +60,9 @@ define i32 @test_sad_u32(i32 %x, i32 %y, i32 %z) { ; CHECK-NEXT: .reg .b32 %r<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_sad_u32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [test_sad_u32_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [test_sad_u32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [test_sad_u32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [test_sad_u32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [test_sad_u32_param_2]; ; CHECK-NEXT: sad.u32 %r4, %r1, %r2, %r3; ; CHECK-NEXT: st.param.b32 [func_retval0], %r4; ; CHECK-NEXT: ret; @@ -76,9 +76,9 @@ define i64 @test_sad_i64(i64 %x, i64 %y, i64 %z) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_sad_i64_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [test_sad_i64_param_1]; -; CHECK-NEXT: ld.param.u64 %rd3, [test_sad_i64_param_2]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_sad_i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_sad_i64_param_1]; +; 
CHECK-NEXT: ld.param.b64 %rd3, [test_sad_i64_param_2]; ; CHECK-NEXT: sad.s64 %rd4, %rd1, %rd2, %rd3; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-NEXT: ret; @@ -92,9 +92,9 @@ define i64 @test_sad_u64(i64 %x, i64 %y, i64 %z) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_sad_u64_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [test_sad_u64_param_1]; -; CHECK-NEXT: ld.param.u64 %rd3, [test_sad_u64_param_2]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_sad_u64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_sad_u64_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [test_sad_u64_param_2]; ; CHECK-NEXT: sad.u64 %rd4, %rd1, %rd2, %rd3; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd4; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/sched1.ll b/llvm/test/CodeGen/NVPTX/sched1.ll index e7358157ea54..09bd4243138a 100644 --- a/llvm/test/CodeGen/NVPTX/sched1.ll +++ b/llvm/test/CodeGen/NVPTX/sched1.ll @@ -5,10 +5,10 @@ define void @foo(ptr %a) { ; CHECK: .func foo -; CHECK: ld.u32 -; CHECK-NEXT: ld.u32 -; CHECK-NEXT: ld.u32 -; CHECK-NEXT: ld.u32 +; CHECK: ld.b32 +; CHECK-NEXT: ld.b32 +; CHECK-NEXT: ld.b32 +; CHECK-NEXT: ld.b32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 diff --git a/llvm/test/CodeGen/NVPTX/sched2.ll b/llvm/test/CodeGen/NVPTX/sched2.ll index 950f9f254042..551875231542 100644 --- a/llvm/test/CodeGen/NVPTX/sched2.ll +++ b/llvm/test/CodeGen/NVPTX/sched2.ll @@ -3,10 +3,10 @@ define void @foo(ptr %a) { ; CHECK: .func foo -; CHECK: ld.v2.u32 -; CHECK-NEXT: ld.v2.u32 -; CHECK-NEXT: ld.v2.u32 -; CHECK-NEXT: ld.v2.u32 +; CHECK: ld.v2.b32 +; CHECK-NEXT: ld.v2.b32 +; CHECK-NEXT: ld.v2.b32 +; CHECK-NEXT: ld.v2.b32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 diff --git a/llvm/test/CodeGen/NVPTX/sext-params.ll b/llvm/test/CodeGen/NVPTX/sext-params.ll index 0a502288c473..a8afcec759fe 100644 --- a/llvm/test/CodeGen/NVPTX/sext-params.ll +++ 
b/llvm/test/CodeGen/NVPTX/sext-params.ll @@ -11,7 +11,7 @@ define i8 @foo(i8 signext %a) { } define i8 @bar(i8 zeroext %a) { -; CHECK: ld.param.u8 +; CHECK: ld.param.b8 %ret = add i8 %a, 3 ret i8 %ret } diff --git a/llvm/test/CodeGen/NVPTX/sext-setcc.ll b/llvm/test/CodeGen/NVPTX/sext-setcc.ll index ba5291a6a95d..802954bda6a9 100644 --- a/llvm/test/CodeGen/NVPTX/sext-setcc.ll +++ b/llvm/test/CodeGen/NVPTX/sext-setcc.ll @@ -11,8 +11,8 @@ define <2 x i16> @sext_setcc_v2i1_to_v2i16(ptr %p) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u64 %rd1, [sext_setcc_v2i1_to_v2i16_param_0]; -; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [sext_setcc_v2i1_to_v2i16_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1]; ; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; ; CHECK-NEXT: setp.eq.s16 %p1, %rs1, 0; ; CHECK-NEXT: setp.eq.s16 %p2, %rs2, 0; @@ -37,8 +37,8 @@ define <4 x i8> @sext_setcc_v4i1_to_v4i8(ptr %p) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %entry -; CHECK-NEXT: ld.param.u64 %rd1, [sext_setcc_v4i1_to_v4i8_param_0]; -; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: ld.param.b64 %rd1, [sext_setcc_v4i1_to_v4i8_param_0]; +; CHECK-NEXT: ld.b32 %r1, [%rd1]; ; CHECK-NEXT: bfe.u32 %r2, %r1, 0, 8; ; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; ; CHECK-NEXT: and.b16 %rs2, %rs1, 255; diff --git a/llvm/test/CodeGen/NVPTX/shfl-p.ll b/llvm/test/CodeGen/NVPTX/shfl-p.ll index a631740cf36d..756998196fde 100644 --- a/llvm/test/CodeGen/NVPTX/shfl-p.ll +++ b/llvm/test/CodeGen/NVPTX/shfl-p.ll @@ -12,9 +12,9 @@ declare {float, i1} @llvm.nvvm.shfl.idx.f32p(float, i32, i32) ; CHECK-LABEL: .func{{.*}}shfl_i32_rrr define {i32, i1} @shfl_i32_rrr(i32 %a, i32 %b, i32 %c) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: 
shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 %c) @@ -23,9 +23,9 @@ define {i32, i1} @shfl_i32_rrr(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_i32_irr define {i32, i1} @shfl_i32_irr(i32 %a, i32 %b, i32 %c) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 %c) @@ -34,8 +34,8 @@ define {i32, i1} @shfl_i32_irr(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_i32_rri define {i32, i1} @shfl_i32_rri(i32 %a, i32 %b) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 1) @@ -44,8 +44,8 @@ define {i32, i1} @shfl_i32_rri(i32 %a, i32 %b) { ; CHECK-LABEL: .func{{.*}}shfl_i32_iri define {i32, i1} @shfl_i32_iri(i32 %a, i32 %b) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 %b, i32 2) @@ -54,8 +54,8 @@ define {i32, i1} @shfl_i32_iri(i32 %a, i32 %b) { ; CHECK-LABEL: .func{{.*}}shfl_i32_rir define {i32, i1} @shfl_i32_rir(i32 %a, i32 %c) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: 
ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 1, i32 %c) @@ -64,8 +64,8 @@ define {i32, i1} @shfl_i32_rir(i32 %a, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_i32_iir define {i32, i1} @shfl_i32_iir(i32 %a, i32 %c) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 2, i32 %c) @@ -74,7 +74,7 @@ define {i32, i1} @shfl_i32_iir(i32 %a, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_i32_rii define {i32, i1} @shfl_i32_rii(i32 %a) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 1, i32 2) @@ -83,7 +83,7 @@ define {i32, i1} @shfl_i32_rii(i32 %a) { ; CHECK-LABEL: .func{{.*}}shfl_i32_iii define {i32, i1} @shfl_i32_iii(i32 %a, i32 %b) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.down.i32p(i32 %a, i32 2, i32 3) @@ -94,9 +94,9 @@ define {i32, i1} @shfl_i32_iii(i32 %a, i32 %b) { ; CHECK-LABEL: .func{{.*}}shfl_f32_rrr define {float, i1} @shfl_f32_rrr(float %a, i32 %b, i32 %c) { - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 
[[C:%r[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 %c) @@ -105,9 +105,9 @@ define {float, i1} @shfl_f32_rrr(float %a, i32 %b, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_f32_irr define {float, i1} @shfl_f32_irr(float %a, i32 %b, i32 %c) { - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 %c) @@ -116,8 +116,8 @@ define {float, i1} @shfl_f32_irr(float %a, i32 %b, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_f32_rri define {float, i1} @shfl_f32_rri(float %a, i32 %b) { - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 1) @@ -126,8 +126,8 @@ define {float, i1} @shfl_f32_rri(float %a, i32 %b) { ; CHECK-LABEL: .func{{.*}}shfl_f32_iri define {float, i1} @shfl_f32_iri(float %a, i32 %b) { - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 %b, i32 2) @@ -136,8 +136,8 @@ define {float, i1} @shfl_f32_iri(float %a, i32 %b) { ; CHECK-LABEL: .func{{.*}}shfl_f32_rir define {float, i1} 
@shfl_f32_rir(float %a, i32 %c) { - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 1, i32 %c) @@ -146,8 +146,8 @@ define {float, i1} @shfl_f32_rir(float %a, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_f32_iir define {float, i1} @shfl_f32_iir(float %a, i32 %c) { - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 2, i32 %c) @@ -156,7 +156,7 @@ define {float, i1} @shfl_f32_iir(float %a, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_f32_rii define {float, i1} @shfl_f32_rii(float %a) { - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 1, i32 2) @@ -165,7 +165,7 @@ define {float, i1} @shfl_f32_rii(float %a) { ; CHECK-LABEL: .func{{.*}}shfl_f32_iii define {float, i1} @shfl_f32_iii(float %a, i32 %b) { - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, i1} @llvm.nvvm.shfl.down.f32p(float %a, i32 2, i32 3) diff --git a/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll b/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll index 20f5c571e9d2..74890dc4fed2 100644 --- a/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll +++ b/llvm/test/CodeGen/NVPTX/shfl-sync-p.ll @@ -12,10 +12,10 @@ declare 
{float, i1} @llvm.nvvm.shfl.sync.idx.f32p(i32, float, i32, i32) ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_rrr define {i32, i1} @shfl_sync_i32_rrr(i32 %mask, i32 %a, i32 %b, i32 %c) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]] - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], [[MASK]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 %b, i32 %c) @@ -24,9 +24,9 @@ define {i32, i1} @shfl_sync_i32_rrr(i32 %mask, i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_irr define {i32, i1} @shfl_sync_i32_irr(i32 %a, i32 %b, i32 %c) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], 1; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 %b, i32 %c) @@ -35,9 +35,9 @@ define {i32, i1} @shfl_sync_i32_irr(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_rri define {i32, i1} @shfl_sync_i32_rri(i32 %mask, i32 %a, i32 %b) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]] - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1, [[MASK]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 
%a, i32 %b, i32 1) @@ -46,8 +46,8 @@ define {i32, i1} @shfl_sync_i32_rri(i32 %mask, i32 %a, i32 %b) { ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_iri define {i32, i1} @shfl_sync_i32_iri(i32 %a, i32 %b) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2, 1; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 %b, i32 2) @@ -56,9 +56,9 @@ define {i32, i1} @shfl_sync_i32_iri(i32 %a, i32 %b) { ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_rir define {i32, i1} @shfl_sync_i32_rir(i32 %mask, i32 %a, i32 %c) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]] - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]], [[MASK]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 1, i32 %c) @@ -67,8 +67,8 @@ define {i32, i1} @shfl_sync_i32_rir(i32 %mask, i32 %a, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_iir define {i32, i1} @shfl_sync_i32_iir(i32 %a, i32 %c) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]], 1; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 2, i32 %c) @@ -77,8 +77,8 @@ define {i32, i1} @shfl_sync_i32_iir(i32 %a, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_rii define {i32, i1} @shfl_sync_i32_rii(i32 %mask, i32 %a) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]] - ; CHECK: ld.param.u32 
[[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2, [[MASK]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 %mask, i32 %a, i32 1, i32 2) @@ -87,7 +87,7 @@ define {i32, i1} @shfl_sync_i32_rii(i32 %mask, i32 %a) { ; CHECK-LABEL: .func{{.*}}shfl_sync_i32_iii define {i32, i1} @shfl_sync_i32_iii(i32 %a, i32 %b) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3, 1; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {i32, i1} @llvm.nvvm.shfl.sync.down.i32p(i32 1, i32 %a, i32 2, i32 3) @@ -98,10 +98,10 @@ define {i32, i1} @shfl_sync_i32_iii(i32 %a, i32 %b) { ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rrr define {float, i1} @shfl_sync_f32_rrr(i32 %mask, float %a, i32 %b, i32 %c) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]] - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], [[MASK]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 %b, i32 %c) @@ -110,9 +110,9 @@ define {float, i1} @shfl_sync_f32_rrr(i32 %mask, float %a, i32 %b, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_irr define {float, i1} @shfl_sync_f32_irr(float %a, i32 %b, i32 %c) { - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 
[[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], [[C]], 1; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 %b, i32 %c) @@ -121,9 +121,9 @@ define {float, i1} @shfl_sync_f32_irr(float %a, i32 %b, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rri define {float, i1} @shfl_sync_f32_rri(i32 %mask, float %a, i32 %b) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]] - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 1, [[MASK]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 %b, i32 1) @@ -132,8 +132,8 @@ define {float, i1} @shfl_sync_f32_rri(i32 %mask, float %a, i32 %b) { ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_iri define {float, i1} @shfl_sync_f32_iri(float %a, i32 %b) { - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], [[B]], 2, 1; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 %b, i32 2) @@ -142,9 +142,9 @@ define {float, i1} @shfl_sync_f32_iri(float %a, i32 %b) { ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rir define {float, i1} @shfl_sync_f32_rir(i32 %mask, float %a, i32 %c) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]] - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, [[C]], [[MASK]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, 
i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 1, i32 %c) @@ -153,8 +153,8 @@ define {float, i1} @shfl_sync_f32_rir(i32 %mask, float %a, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_iir define {float, i1} @shfl_sync_f32_iir(float %a, i32 %c) { - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, [[C]], 1; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 2, i32 %c) @@ -163,8 +163,8 @@ define {float, i1} @shfl_sync_f32_iir(float %a, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_rii define {float, i1} @shfl_sync_f32_rii(i32 %mask, float %a) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]] - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 1, 2, [[MASK]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 %mask, float %a, i32 1, i32 2) @@ -173,7 +173,7 @@ define {float, i1} @shfl_sync_f32_rii(i32 %mask, float %a) { ; CHECK-LABEL: .func{{.*}}shfl_sync_f32_iii define {float, i1} @shfl_sync_f32_iii(float %a, i32 %b) { - ; CHECK: ld.param.f32 [[A:%f[0-9]+]] + ; CHECK: ld.param.b32 [[A:%f[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%f[0-9]+]]|[[OUTP:%p[0-9]+]], [[A]], 2, 3, 1; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call {float, i1} @llvm.nvvm.shfl.sync.down.f32p(i32 1, float %a, i32 2, i32 3) diff --git a/llvm/test/CodeGen/NVPTX/shfl-sync.ll b/llvm/test/CodeGen/NVPTX/shfl-sync.ll index a7e2932e61d3..0c826d221d05 100644 --- a/llvm/test/CodeGen/NVPTX/shfl-sync.ll +++ b/llvm/test/CodeGen/NVPTX/shfl-sync.ll @@ -12,10 +12,10 @@ declare float @llvm.nvvm.shfl.sync.idx.f32(float, i32, i32, i32) ; CHECK-LABEL: 
.func{{.*}}shfl_sync_rrr define i32 @shfl_sync_rrr(i32 %mask, i32 %a, i32 %b, i32 %c) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]] - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], [[C]], [[MASK]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 %b, i32 %c) @@ -24,9 +24,9 @@ define i32 @shfl_sync_rrr(i32 %mask, i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_sync_irr define i32 @shfl_sync_irr(i32 %a, i32 %b, i32 %c) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], [[C]], 1; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 %b, i32 %c) @@ -35,9 +35,9 @@ define i32 @shfl_sync_irr(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_sync_rri define i32 @shfl_sync_rri(i32 %mask, i32 %a, i32 %b) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]] - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], 1, [[MASK]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 %b, i32 1) @@ -46,8 +46,8 @@ define i32 @shfl_sync_rri(i32 %mask, i32 %a, i32 %b) { ; CHECK-LABEL: .func{{.*}}shfl_sync_iri define i32 @shfl_sync_iri(i32 %a, i32 %b) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - 
; CHECK: ld.param.u32 [[B:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[B:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], [[B]], 2, 1; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 %b, i32 2) @@ -56,9 +56,9 @@ define i32 @shfl_sync_iri(i32 %a, i32 %b) { ; CHECK-LABEL: .func{{.*}}shfl_sync_rir define i32 @shfl_sync_rir(i32 %mask, i32 %a, i32 %c) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]] - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 1, [[C]], [[MASK]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 1, i32 %c) @@ -67,8 +67,8 @@ define i32 @shfl_sync_rir(i32 %mask, i32 %a, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_sync_iir define i32 @shfl_sync_iir(i32 %a, i32 %c) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] - ; CHECK: ld.param.u32 [[C:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[C:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 2, [[C]], 1; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 2, i32 %c) @@ -77,8 +77,8 @@ define i32 @shfl_sync_iir(i32 %a, i32 %c) { ; CHECK-LABEL: .func{{.*}}shfl_sync_rii define i32 @shfl_sync_rii(i32 %mask, i32 %a) { - ; CHECK: ld.param.u32 [[MASK:%r[0-9]+]] - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[MASK:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 1, 2, [[MASK]]; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 %mask, i32 %a, i32 1, i32 2) @@ -87,7 +87,7 @@ define i32 @shfl_sync_rii(i32 %mask, i32 %a) { ; CHECK-LABEL: .func{{.*}}shfl_sync_iii 
define i32 @shfl_sync_iii(i32 %a, i32 %b) { - ; CHECK: ld.param.u32 [[A:%r[0-9]+]] + ; CHECK: ld.param.b32 [[A:%r[0-9]+]] ; CHECK: shfl.sync.down.b32 [[OUT:%r[0-9]+]], [[A]], 2, 3, 1; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call i32 @llvm.nvvm.shfl.sync.down.i32(i32 1, i32 %a, i32 2, i32 3) diff --git a/llvm/test/CodeGen/NVPTX/shfl.ll b/llvm/test/CodeGen/NVPTX/shfl.ll index fbf4ea4cd1c7..8aedba26b56b 100644 --- a/llvm/test/CodeGen/NVPTX/shfl.ll +++ b/llvm/test/CodeGen/NVPTX/shfl.ll @@ -15,7 +15,7 @@ declare float @llvm.nvvm.shfl.idx.f32(float, i32, i32) ; CHECK-LABEL: .func{{.*}}shfl_down1 define i32 @shfl_down1(i32 %in) { - ; CHECK: ld.param.u32 [[IN:%r[0-9]+]] + ; CHECK: ld.param.b32 [[IN:%r[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%r[0-9]+]], [[IN]], 1, 2; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %val = call i32 @llvm.nvvm.shfl.down.i32(i32 %in, i32 1, i32 2) @@ -24,8 +24,8 @@ define i32 @shfl_down1(i32 %in) { ; CHECK-LABEL: .func{{.*}}shfl_down2 define i32 @shfl_down2(i32 %in, i32 %width) { - ; CHECK: ld.param.u32 [[IN1:%r[0-9]+]] - ; CHECK: ld.param.u32 [[IN2:%r[0-9]+]] + ; CHECK: ld.param.b32 [[IN1:%r[0-9]+]] + ; CHECK: ld.param.b32 [[IN2:%r[0-9]+]] ; CHECK: shfl.down.{{.}}32 %r{{[0-9]+}}, [[IN1]], [[IN2]], 3; %val = call i32 @llvm.nvvm.shfl.down.i32(i32 %in, i32 %width, i32 3) ret i32 %val @@ -33,8 +33,8 @@ define i32 @shfl_down2(i32 %in, i32 %width) { ; CHECK-LABEL: .func{{.*}}shfl_down3 define i32 @shfl_down3(i32 %in, i32 %mask) { - ; CHECK: ld.param.u32 [[IN1:%r[0-9]+]] - ; CHECK: ld.param.u32 [[IN2:%r[0-9]+]] + ; CHECK: ld.param.b32 [[IN1:%r[0-9]+]] + ; CHECK: ld.param.b32 [[IN2:%r[0-9]+]] ; CHECK: shfl.down.{{.}}32 %r{{[0-9]+}}, [[IN1]], 4, [[IN2]]; %val = call i32 @llvm.nvvm.shfl.down.i32(i32 %in, i32 4, i32 %mask) ret i32 %val @@ -42,9 +42,9 @@ define i32 @shfl_down3(i32 %in, i32 %mask) { ; CHECK-LABEL: .func{{.*}}shfl_down4 define i32 @shfl_down4(i32 %in, i32 %width, i32 %mask) { - ; CHECK: ld.param.u32 [[IN1:%r[0-9]+]] - ; CHECK: ld.param.u32 
[[IN2:%r[0-9]+]] - ; CHECK: ld.param.u32 [[IN3:%r[0-9]+]] + ; CHECK: ld.param.b32 [[IN1:%r[0-9]+]] + ; CHECK: ld.param.b32 [[IN2:%r[0-9]+]] + ; CHECK: ld.param.b32 [[IN3:%r[0-9]+]] ; CHECK: shfl.down.{{.}}32 %r{{[0-9]+}}, [[IN1]], [[IN2]], [[IN3]]; %val = call i32 @llvm.nvvm.shfl.down.i32(i32 %in, i32 %width, i32 %mask) ret i32 %val @@ -53,7 +53,7 @@ define i32 @shfl_down4(i32 %in, i32 %width, i32 %mask) { ; Try shfl.down with floating-point params. ; CHECK-LABEL: .func{{.*}}shfl_down_float define float @shfl_down_float(float %in) { - ; CHECK: ld.param.f32 [[IN:%f[0-9]+]] + ; CHECK: ld.param.b32 [[IN:%f[0-9]+]] ; CHECK: shfl.down.b32 [[OUT:%f[0-9]+]], [[IN]], 5, 6; ; CHECK: st.param.{{.}}32 {{.*}}, [[OUT]] %out = call float @llvm.nvvm.shfl.down.f32(float %in, i32 5, i32 6) diff --git a/llvm/test/CodeGen/NVPTX/short-ptr.ll b/llvm/test/CodeGen/NVPTX/short-ptr.ll index 55828fa9ec80..eb058955e0aa 100644 --- a/llvm/test/CodeGen/NVPTX/short-ptr.ll +++ b/llvm/test/CodeGen/NVPTX/short-ptr.ll @@ -22,9 +22,9 @@ declare void @use(i8 %arg); ; CHECK-DEFAULT-32: .param .b32 test1_param_0 ; CHECK-SHORT-LOCAL: .param .b32 test1_param_0 define void @test1(ptr addrspace(5) %local) { - ; CHECK-DEFAULT: ld.param.u64 %rd{{.*}}, [test1_param_0]; - ; CHECK-DEFAULT-32: ld.param.u32 %r{{.*}}, [test1_param_0]; - ; CHECK-SHORT-LOCAL: ld.param.u32 %r{{.*}}, [test1_param_0]; + ; CHECK-DEFAULT: ld.param.b64 %rd{{.*}}, [test1_param_0]; + ; CHECK-DEFAULT-32: ld.param.b32 %r{{.*}}, [test1_param_0]; + ; CHECK-SHORT-LOCAL: ld.param.b32 %r{{.*}}, [test1_param_0]; %v = load i8, ptr addrspace(5) %local call void @use(i8 %v) ret void diff --git a/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll b/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll index a6a286e608ce..d79029f124d8 100644 --- a/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll +++ b/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll @@ -9,14 +9,14 @@ define void @kernel_func(ptr %in.vec, ptr %out.vec0) nounwind { ; CHECK-NEXT: .reg .b32 
%r<14>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [kernel_func_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [kernel_func_param_0]; ; CHECK-NEXT: ld.v4.b32 {%r2, %r3, %r4, %r5}, [%r1]; ; CHECK-NEXT: ld.v4.b32 {%r6, %r7, %r8, %r9}, [%r1+16]; -; CHECK-NEXT: ld.param.u32 %r10, [kernel_func_param_1]; +; CHECK-NEXT: ld.param.b32 %r10, [kernel_func_param_1]; ; CHECK-NEXT: prmt.b32 %r11, %r6, %r8, 0x4000U; ; CHECK-NEXT: prmt.b32 %r12, %r2, %r4, 0x40U; ; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x7610U; -; CHECK-NEXT: st.u32 [%r10], %r13; +; CHECK-NEXT: st.b32 [%r10], %r13; ; CHECK-NEXT: ret; %wide.vec = load <32 x i8>, ptr %in.vec, align 64 %vec0 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <4 x i32> diff --git a/llvm/test/CodeGen/NVPTX/st-addrspace.ll b/llvm/test/CodeGen/NVPTX/st-addrspace.ll index daccaaf57d52..d2b3f2b61ffb 100644 --- a/llvm/test/CodeGen/NVPTX/st-addrspace.ll +++ b/llvm/test/CodeGen/NVPTX/st-addrspace.ll @@ -8,24 +8,24 @@ ;; i8 ; ALL-LABEL: st_global_i8 define void @st_global_i8(ptr addrspace(1) %ptr, i8 %a) { -; G32: st.global.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} -; G64: st.global.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} +; G32: st.global.b8 [%r{{[0-9]+}}], %rs{{[0-9]+}} +; G64: st.global.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; ALL: ret store i8 %a, ptr addrspace(1) %ptr ret void } ; ALL-LABEL: st_shared_i8 define void @st_shared_i8(ptr addrspace(3) %ptr, i8 %a) { -; LS32: st.shared.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} -; LS64: st.shared.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} +; LS32: st.shared.b8 [%r{{[0-9]+}}], %rs{{[0-9]+}} +; LS64: st.shared.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; ALL: ret store i8 %a, ptr addrspace(3) %ptr ret void } ; ALL-LABEL: st_local_i8 define void @st_local_i8(ptr addrspace(5) %ptr, i8 %a) { -; LS32: st.local.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} -; LS64: st.local.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} +; LS32: st.local.b8 [%r{{[0-9]+}}], %rs{{[0-9]+}} +; LS64: st.local.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; ALL: ret store i8 %a, 
ptr addrspace(5) %ptr ret void @@ -34,24 +34,24 @@ define void @st_local_i8(ptr addrspace(5) %ptr, i8 %a) { ;; i16 ; ALL-LABEL: st_global_i16 define void @st_global_i16(ptr addrspace(1) %ptr, i16 %a) { -; G32: st.global.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} -; G64: st.global.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} +; G32: st.global.b16 [%r{{[0-9]+}}], %rs{{[0-9]+}} +; G64: st.global.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; ALL: ret store i16 %a, ptr addrspace(1) %ptr ret void } ; ALL-LABEL: st_shared_i16 define void @st_shared_i16(ptr addrspace(3) %ptr, i16 %a) { -; LS32: st.shared.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} -; LS64: st.shared.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} +; LS32: st.shared.b16 [%r{{[0-9]+}}], %rs{{[0-9]+}} +; LS64: st.shared.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; ALL: ret store i16 %a, ptr addrspace(3) %ptr ret void } ; ALL-LABEL: st_local_i16 define void @st_local_i16(ptr addrspace(5) %ptr, i16 %a) { -; LS32: st.local.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} -; LS64: st.local.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} +; LS32: st.local.b16 [%r{{[0-9]+}}], %rs{{[0-9]+}} +; LS64: st.local.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; ALL: ret store i16 %a, ptr addrspace(5) %ptr ret void @@ -60,24 +60,24 @@ define void @st_local_i16(ptr addrspace(5) %ptr, i16 %a) { ;; i32 ; ALL-LABEL: st_global_i32 define void @st_global_i32(ptr addrspace(1) %ptr, i32 %a) { -; G32: st.global.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} -; G64: st.global.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} +; G32: st.global.b32 [%r{{[0-9]+}}], %r{{[0-9]+}} +; G64: st.global.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; ALL: ret store i32 %a, ptr addrspace(1) %ptr ret void } ; ALL-LABEL: st_shared_i32 define void @st_shared_i32(ptr addrspace(3) %ptr, i32 %a) { -; LS32: st.shared.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} -; LS64: st.shared.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} +; LS32: st.shared.b32 [%r{{[0-9]+}}], %r{{[0-9]+}} +; LS64: st.shared.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; PTX64: ret store i32 %a, ptr addrspace(3) %ptr ret void } ; ALL-LABEL: st_local_i32 
define void @st_local_i32(ptr addrspace(5) %ptr, i32 %a) { -; LS32: st.local.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} -; LS64: st.local.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} +; LS32: st.local.b32 [%r{{[0-9]+}}], %r{{[0-9]+}} +; LS64: st.local.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; ALL: ret store i32 %a, ptr addrspace(5) %ptr ret void @@ -86,24 +86,24 @@ define void @st_local_i32(ptr addrspace(5) %ptr, i32 %a) { ;; i64 ; ALL-LABEL: st_global_i64 define void @st_global_i64(ptr addrspace(1) %ptr, i64 %a) { -; G32: st.global.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} -; G64: st.global.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} +; G32: st.global.b64 [%r{{[0-9]+}}], %rd{{[0-9]+}} +; G64: st.global.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; ALL: ret store i64 %a, ptr addrspace(1) %ptr ret void } ; ALL-LABEL: st_shared_i64 define void @st_shared_i64(ptr addrspace(3) %ptr, i64 %a) { -; LS32: st.shared.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} -; LS64: st.shared.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} +; LS32: st.shared.b64 [%r{{[0-9]+}}], %rd{{[0-9]+}} +; LS64: st.shared.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; ALL: ret store i64 %a, ptr addrspace(3) %ptr ret void } ; ALL-LABEL: st_local_i64 define void @st_local_i64(ptr addrspace(5) %ptr, i64 %a) { -; LS32: st.local.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} -; LS64: st.local.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} +; LS32: st.local.b64 [%r{{[0-9]+}}], %rd{{[0-9]+}} +; LS64: st.local.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; ALL: ret store i64 %a, ptr addrspace(5) %ptr ret void @@ -112,24 +112,24 @@ define void @st_local_i64(ptr addrspace(5) %ptr, i64 %a) { ;; f32 ; ALL-LABEL: st_global_f32 define void @st_global_f32(ptr addrspace(1) %ptr, float %a) { -; G32: st.global.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} -; G64: st.global.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} +; G32: st.global.b32 [%r{{[0-9]+}}], %f{{[0-9]+}} +; G64: st.global.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; ALL: ret store float %a, ptr addrspace(1) %ptr ret void } ; ALL-LABEL: st_shared_f32 define void @st_shared_f32(ptr addrspace(3) %ptr, float 
%a) { -; LS32: st.shared.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} -; LS64: st.shared.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} +; LS32: st.shared.b32 [%r{{[0-9]+}}], %f{{[0-9]+}} +; LS64: st.shared.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; ALL: ret store float %a, ptr addrspace(3) %ptr ret void } ; ALL-LABEL: st_local_f32 define void @st_local_f32(ptr addrspace(5) %ptr, float %a) { -; LS32: st.local.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} -; LS64: st.local.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} +; LS32: st.local.b32 [%r{{[0-9]+}}], %f{{[0-9]+}} +; LS64: st.local.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; ALL: ret store float %a, ptr addrspace(5) %ptr ret void @@ -138,24 +138,24 @@ define void @st_local_f32(ptr addrspace(5) %ptr, float %a) { ;; f64 ; ALL-LABEL: st_global_f64 define void @st_global_f64(ptr addrspace(1) %ptr, double %a) { -; G32: st.global.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} -; G64: st.global.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} +; G32: st.global.b64 [%r{{[0-9]+}}], %fd{{[0-9]+}} +; G64: st.global.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; ALL: ret store double %a, ptr addrspace(1) %ptr ret void } ; ALL-LABEL: st_shared_f64 define void @st_shared_f64(ptr addrspace(3) %ptr, double %a) { -; LS32: st.shared.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} -; LS64: st.shared.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} +; LS32: st.shared.b64 [%r{{[0-9]+}}], %fd{{[0-9]+}} +; LS64: st.shared.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; ALL: ret store double %a, ptr addrspace(3) %ptr ret void } ; ALL-LABEL: st_local_f64 define void @st_local_f64(ptr addrspace(5) %ptr, double %a) { -; LS32: st.local.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} -; LS64: st.local.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} +; LS32: st.local.b64 [%r{{[0-9]+}}], %fd{{[0-9]+}} +; LS64: st.local.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; ALL: ret store double %a, ptr addrspace(5) %ptr ret void diff --git a/llvm/test/CodeGen/NVPTX/st-generic.ll b/llvm/test/CodeGen/NVPTX/st-generic.ll index c5062ed10e79..cdf9dba82551 100644 --- a/llvm/test/CodeGen/NVPTX/st-generic.ll +++ 
b/llvm/test/CodeGen/NVPTX/st-generic.ll @@ -6,9 +6,9 @@ ;; i8 define void @st_global_i8(ptr addrspace(0) %ptr, i8 %a) { -; PTX32: st.u8 [%r{{[0-9]+}}], %rs{{[0-9]+}} +; PTX32: st.b8 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.u8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.b8 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i8 %a, ptr addrspace(0) %ptr ret void @@ -17,9 +17,9 @@ define void @st_global_i8(ptr addrspace(0) %ptr, i8 %a) { ;; i16 define void @st_global_i16(ptr addrspace(0) %ptr, i16 %a) { -; PTX32: st.u16 [%r{{[0-9]+}}], %rs{{[0-9]+}} +; PTX32: st.b16 [%r{{[0-9]+}}], %rs{{[0-9]+}} ; PTX32: ret -; PTX64: st.u16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} +; PTX64: st.b16 [%rd{{[0-9]+}}], %rs{{[0-9]+}} ; PTX64: ret store i16 %a, ptr addrspace(0) %ptr ret void @@ -28,9 +28,9 @@ define void @st_global_i16(ptr addrspace(0) %ptr, i16 %a) { ;; i32 define void @st_global_i32(ptr addrspace(0) %ptr, i32 %a) { -; PTX32: st.u32 [%r{{[0-9]+}}], %r{{[0-9]+}} +; PTX32: st.b32 [%r{{[0-9]+}}], %r{{[0-9]+}} ; PTX32: ret -; PTX64: st.u32 [%rd{{[0-9]+}}], %r{{[0-9]+}} +; PTX64: st.b32 [%rd{{[0-9]+}}], %r{{[0-9]+}} ; PTX64: ret store i32 %a, ptr addrspace(0) %ptr ret void @@ -39,9 +39,9 @@ define void @st_global_i32(ptr addrspace(0) %ptr, i32 %a) { ;; i64 define void @st_global_i64(ptr addrspace(0) %ptr, i64 %a) { -; PTX32: st.u64 [%r{{[0-9]+}}], %rd{{[0-9]+}} +; PTX32: st.b64 [%r{{[0-9]+}}], %rd{{[0-9]+}} ; PTX32: ret -; PTX64: st.u64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} +; PTX64: st.b64 [%rd{{[0-9]+}}], %rd{{[0-9]+}} ; PTX64: ret store i64 %a, ptr addrspace(0) %ptr ret void @@ -50,9 +50,9 @@ define void @st_global_i64(ptr addrspace(0) %ptr, i64 %a) { ;; f32 define void @st_global_f32(ptr addrspace(0) %ptr, float %a) { -; PTX32: st.f32 [%r{{[0-9]+}}], %f{{[0-9]+}} +; PTX32: st.b32 [%r{{[0-9]+}}], %f{{[0-9]+}} ; PTX32: ret -; PTX64: st.f32 [%rd{{[0-9]+}}], %f{{[0-9]+}} +; PTX64: st.b32 [%rd{{[0-9]+}}], %f{{[0-9]+}} ; PTX64: ret store float %a, ptr addrspace(0) %ptr ret void @@ 
-61,9 +61,9 @@ define void @st_global_f32(ptr addrspace(0) %ptr, float %a) { ;; f64 define void @st_global_f64(ptr addrspace(0) %ptr, double %a) { -; PTX32: st.f64 [%r{{[0-9]+}}], %fd{{[0-9]+}} +; PTX32: st.b64 [%r{{[0-9]+}}], %fd{{[0-9]+}} ; PTX32: ret -; PTX64: st.f64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} +; PTX64: st.b64 [%rd{{[0-9]+}}], %fd{{[0-9]+}} ; PTX64: ret store double %a, ptr addrspace(0) %ptr ret void diff --git a/llvm/test/CodeGen/NVPTX/st-param-imm.ll b/llvm/test/CodeGen/NVPTX/st-param-imm.ll index 0e67e52d52da..5f1ea5d7b1e2 100644 --- a/llvm/test/CodeGen/NVPTX/st-param-imm.ll +++ b/llvm/test/CodeGen/NVPTX/st-param-imm.ll @@ -87,7 +87,7 @@ define void @st_param_f32() { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 3, 0 ; CHECK-NEXT: .param .b32 param0; -; CHECK-NEXT: st.param.f32 [param0], 0f40A00000; +; CHECK-NEXT: st.param.b32 [param0], 0f40A00000; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_f32, ; CHECK-NEXT: ( @@ -107,7 +107,7 @@ define void @st_param_f64() { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 4, 0 ; CHECK-NEXT: .param .b64 param0; -; CHECK-NEXT: st.param.f64 [param0], 0d4018000000000000; +; CHECK-NEXT: st.param.b64 [param0], 0d4018000000000000; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_f64, ; CHECK-NEXT: ( @@ -150,7 +150,7 @@ define void @st_param_v2_i8_ir(i8 %val) { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v2_i8_ir_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v2_i8_ir_param_0]; ; CHECK-NEXT: { // callseq 6, 0 ; CHECK-NEXT: .param .align 2 .b8 param0[2]; ; CHECK-NEXT: st.param.v2.b8 [param0], {1, %rs1}; @@ -172,7 +172,7 @@ define void @st_param_v2_i8_ri(i8 %val) { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v2_i8_ri_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v2_i8_ri_param_0]; ; CHECK-NEXT: { // callseq 7, 0 ; CHECK-NEXT: .param .align 2 .b8 param0[2]; ; CHECK-NEXT: 
st.param.v2.b8 [param0], {%rs1, 2}; @@ -214,7 +214,7 @@ define void @st_param_v2_i16_ir(i16 %val) { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v2_i16_ir_param_0]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v2_i16_ir_param_0]; ; CHECK-NEXT: { // callseq 9, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v2.b16 [param0], {1, %rs1}; @@ -236,7 +236,7 @@ define void @st_param_v2_i16_ri(i16 %val) { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v2_i16_ri_param_0]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v2_i16_ri_param_0]; ; CHECK-NEXT: { // callseq 10, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v2.b16 [param0], {%rs1, 2}; @@ -278,7 +278,7 @@ define void @st_param_v2_i32_ir(i32 %val) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v2_i32_ir_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v2_i32_ir_param_0]; ; CHECK-NEXT: { // callseq 12, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v2.b32 [param0], {1, %r1}; @@ -300,7 +300,7 @@ define void @st_param_v2_i32_ri(i32 %val) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v2_i32_ri_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v2_i32_ri_param_0]; ; CHECK-NEXT: { // callseq 13, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v2.b32 [param0], {%r1, 2}; @@ -342,7 +342,7 @@ define void @st_param_v2_i64_ir(i64 %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [st_param_v2_i64_ir_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [st_param_v2_i64_ir_param_0]; ; CHECK-NEXT: { // callseq 15, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v2.b64 [param0], {1, %rd1}; @@ -364,7 
+364,7 @@ define void @st_param_v2_i64_ri(i64 %val) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [st_param_v2_i64_ri_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [st_param_v2_i64_ri_param_0]; ; CHECK-NEXT: { // callseq 16, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v2.b64 [param0], {%rd1, 2}; @@ -389,7 +389,7 @@ define void @st_param_v2_f32_ii(float %val) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 17, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v2.f32 [param0], {0f3F800000, 0f40000000}; +; CHECK-NEXT: st.param.v2.b32 [param0], {0f3F800000, 0f40000000}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v2_f32, ; CHECK-NEXT: ( @@ -406,10 +406,10 @@ define void @st_param_v2_f32_ir(float %val) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v2_f32_ir_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v2_f32_ir_param_0]; ; CHECK-NEXT: { // callseq 18, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v2.f32 [param0], {0f3F800000, %f1}; +; CHECK-NEXT: st.param.v2.b32 [param0], {0f3F800000, %f1}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v2_f32, ; CHECK-NEXT: ( @@ -428,10 +428,10 @@ define void @st_param_v2_f32_ri(float %val) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v2_f32_ri_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v2_f32_ri_param_0]; ; CHECK-NEXT: { // callseq 19, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; -; CHECK-NEXT: st.param.v2.f32 [param0], {%f1, 0f40000000}; +; CHECK-NEXT: st.param.v2.b32 [param0], {%f1, 0f40000000}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v2_f32, ; CHECK-NEXT: ( @@ -453,7 +453,7 @@ define void @st_param_v2_f64_ii(double %val) { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 20, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: 
st.param.v2.f64 [param0], {0d3FF0000000000000, 0d4000000000000000}; +; CHECK-NEXT: st.param.v2.b64 [param0], {0d3FF0000000000000, 0d4000000000000000}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v2_f64, ; CHECK-NEXT: ( @@ -470,10 +470,10 @@ define void @st_param_v2_f64_ir(double %val) { ; CHECK-NEXT: .reg .b64 %fd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [st_param_v2_f64_ir_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [st_param_v2_f64_ir_param_0]; ; CHECK-NEXT: { // callseq 21, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v2.f64 [param0], {0d3FF0000000000000, %fd1}; +; CHECK-NEXT: st.param.v2.b64 [param0], {0d3FF0000000000000, %fd1}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v2_f64, ; CHECK-NEXT: ( @@ -492,10 +492,10 @@ define void @st_param_v2_f64_ri(double %val) { ; CHECK-NEXT: .reg .b64 %fd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f64 %fd1, [st_param_v2_f64_ri_param_0]; +; CHECK-NEXT: ld.param.b64 %fd1, [st_param_v2_f64_ri_param_0]; ; CHECK-NEXT: { // callseq 22, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v2.f64 [param0], {%fd1, 0d4000000000000000}; +; CHECK-NEXT: st.param.v2.b64 [param0], {%fd1, 0d4000000000000000}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v2_f64, ; CHECK-NEXT: ( @@ -541,9 +541,9 @@ define void @st_param_v4_i8_irrr(i8 %b, i8 %c, i8 %d) { ; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_irrr_param_0]; -; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_irrr_param_1]; -; CHECK-NEXT: ld.param.u8 %rs3, [st_param_v4_i8_irrr_param_2]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irrr_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irrr_param_1]; +; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_irrr_param_2]; ; CHECK-NEXT: { // callseq 24, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, %rs2, %rs3}; @@ 
-567,9 +567,9 @@ define void @st_param_v4_i8_rirr(i8 %a, i8 %c, i8 %d) { ; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_rirr_param_0]; -; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_rirr_param_1]; -; CHECK-NEXT: ld.param.u8 %rs3, [st_param_v4_i8_rirr_param_2]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rirr_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rirr_param_1]; +; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rirr_param_2]; ; CHECK-NEXT: { // callseq 25, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, %rs2, %rs3}; @@ -593,9 +593,9 @@ define void @st_param_v4_i8_rrir(i8 %a, i8 %b, i8 %d) { ; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_rrir_param_0]; -; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_rrir_param_1]; -; CHECK-NEXT: ld.param.u8 %rs3, [st_param_v4_i8_rrir_param_2]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrir_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrir_param_1]; +; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrir_param_2]; ; CHECK-NEXT: { // callseq 26, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, 3, %rs3}; @@ -619,9 +619,9 @@ define void @st_param_v4_i8_rrri(i8 %a, i8 %b, i8 %c) { ; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_rrri_param_0]; -; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_rrri_param_1]; -; CHECK-NEXT: ld.param.u8 %rs3, [st_param_v4_i8_rrri_param_2]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrri_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrri_param_1]; +; CHECK-NEXT: ld.param.b8 %rs3, [st_param_v4_i8_rrri_param_2]; ; CHECK-NEXT: { // callseq 27, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, 
%rs3, 4}; @@ -645,8 +645,8 @@ define void @st_param_v4_i8_iirr(i8 %c, i8 %d) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_iirr_param_0]; -; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_iirr_param_1]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iirr_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_iirr_param_1]; ; CHECK-NEXT: { // callseq 28, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, %rs1, %rs2}; @@ -670,8 +670,8 @@ define void @st_param_v4_i8_irir(i8 %b, i8 %d) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_irir_param_0]; -; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_irir_param_1]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irir_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irir_param_1]; ; CHECK-NEXT: { // callseq 29, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, 3, %rs2}; @@ -695,8 +695,8 @@ define void @st_param_v4_i8_irri(i8 %b, i8 %c) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_irri_param_0]; -; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_irri_param_1]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irri_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_irri_param_1]; ; CHECK-NEXT: { // callseq 30, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, %rs2, 4}; @@ -720,8 +720,8 @@ define void @st_param_v4_i8_riir(i8 %a, i8 %d) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_riir_param_0]; -; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_riir_param_1]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riir_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, 
[st_param_v4_i8_riir_param_1]; ; CHECK-NEXT: { // callseq 31, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, 3, %rs2}; @@ -745,8 +745,8 @@ define void @st_param_v4_i8_riri(i8 %a, i8 %c) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_riri_param_0]; -; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_riri_param_1]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riri_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_riri_param_1]; ; CHECK-NEXT: { // callseq 32, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, %rs2, 4}; @@ -770,8 +770,8 @@ define void @st_param_v4_i8_rrii(i8 %a, i8 %b) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_rrii_param_0]; -; CHECK-NEXT: ld.param.u8 %rs2, [st_param_v4_i8_rrii_param_1]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_rrii_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [st_param_v4_i8_rrii_param_1]; ; CHECK-NEXT: { // callseq 33, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, %rs2, 3, 4}; @@ -795,7 +795,7 @@ define void @st_param_v4_i8_iiir(i8 %d) { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_iiir_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iiir_param_0]; ; CHECK-NEXT: { // callseq 34, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {1, 2, 3, %rs1}; @@ -819,7 +819,7 @@ define void @st_param_v4_i8_iiri(i8 %c) { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_iiri_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_iiri_param_0]; ; CHECK-NEXT: { // callseq 35, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 
[param0], {1, 2, %rs1, 4}; @@ -843,7 +843,7 @@ define void @st_param_v4_i8_irii(i8 %b) { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_irii_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_irii_param_0]; ; CHECK-NEXT: { // callseq 36, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {1, %rs1, 3, 4}; @@ -867,7 +867,7 @@ define void @st_param_v4_i8_riii(i8 %a) { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [st_param_v4_i8_riii_param_0]; +; CHECK-NEXT: ld.param.b8 %rs1, [st_param_v4_i8_riii_param_0]; ; CHECK-NEXT: { // callseq 37, 0 ; CHECK-NEXT: .param .align 4 .b8 param0[4]; ; CHECK-NEXT: st.param.v4.b8 [param0], {%rs1, 2, 3, 4}; @@ -911,9 +911,9 @@ define void @st_param_v4_i16_irrr(i16 %b, i16 %c, i16 %d) { ; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_irrr_param_0]; -; CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_irrr_param_1]; -; CHECK-NEXT: ld.param.u16 %rs3, [st_param_v4_i16_irrr_param_2]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_irrr_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_irrr_param_1]; +; CHECK-NEXT: ld.param.b16 %rs3, [st_param_v4_i16_irrr_param_2]; ; CHECK-NEXT: { // callseq 39, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {1, %rs1, %rs2, %rs3}; @@ -937,9 +937,9 @@ define void @st_param_v4_i16_rirr(i16 %a, i16 %c, i16 %d) { ; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_rirr_param_0]; -; CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_rirr_param_1]; -; CHECK-NEXT: ld.param.u16 %rs3, [st_param_v4_i16_rirr_param_2]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_rirr_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_rirr_param_1]; +; 
CHECK-NEXT: ld.param.b16 %rs3, [st_param_v4_i16_rirr_param_2]; ; CHECK-NEXT: { // callseq 40, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, 2, %rs2, %rs3}; @@ -963,9 +963,9 @@ define void @st_param_v4_i16_rrir(i16 %a, i16 %b, i16 %d) { ; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_rrir_param_0]; -; CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_rrir_param_1]; -; CHECK-NEXT: ld.param.u16 %rs3, [st_param_v4_i16_rrir_param_2]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_rrir_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_rrir_param_1]; +; CHECK-NEXT: ld.param.b16 %rs3, [st_param_v4_i16_rrir_param_2]; ; CHECK-NEXT: { // callseq 41, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, %rs2, 3, %rs3}; @@ -989,9 +989,9 @@ define void @st_param_v4_i16_rrri(i16 %a, i16 %b, i16 %c) { ; CHECK-NEXT: .reg .b16 %rs<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_rrri_param_0]; -; CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_rrri_param_1]; -; CHECK-NEXT: ld.param.u16 %rs3, [st_param_v4_i16_rrri_param_2]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_rrri_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_rrri_param_1]; +; CHECK-NEXT: ld.param.b16 %rs3, [st_param_v4_i16_rrri_param_2]; ; CHECK-NEXT: { // callseq 42, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, %rs2, %rs3, 4}; @@ -1015,8 +1015,8 @@ define void @st_param_v4_i16_iirr(i16 %c, i16 %d) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_iirr_param_0]; -; CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_iirr_param_1]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_iirr_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_iirr_param_1]; ; 
CHECK-NEXT: { // callseq 43, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, %rs1, %rs2}; @@ -1040,8 +1040,8 @@ define void @st_param_v4_i16_irir(i16 %b, i16 %d) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_irir_param_0]; -; CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_irir_param_1]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_irir_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_irir_param_1]; ; CHECK-NEXT: { // callseq 44, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {1, %rs1, 3, %rs2}; @@ -1065,8 +1065,8 @@ define void @st_param_v4_i16_irri(i16 %b, i16 %c) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_irri_param_0]; -; CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_irri_param_1]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_irri_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_irri_param_1]; ; CHECK-NEXT: { // callseq 45, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {1, %rs1, %rs2, 4}; @@ -1090,8 +1090,8 @@ define void @st_param_v4_i16_riir(i16 %a, i16 %d) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_riir_param_0]; -; CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_riir_param_1]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_riir_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_riir_param_1]; ; CHECK-NEXT: { // callseq 46, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, 2, 3, %rs2}; @@ -1115,8 +1115,8 @@ define void @st_param_v4_i16_riri(i16 %a, i16 %c) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_riri_param_0]; -; 
CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_riri_param_1]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_riri_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_riri_param_1]; ; CHECK-NEXT: { // callseq 47, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, 2, %rs2, 4}; @@ -1140,8 +1140,8 @@ define void @st_param_v4_i16_rrii(i16 %a, i16 %b) { ; CHECK-NEXT: .reg .b16 %rs<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_rrii_param_0]; -; CHECK-NEXT: ld.param.u16 %rs2, [st_param_v4_i16_rrii_param_1]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_rrii_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [st_param_v4_i16_rrii_param_1]; ; CHECK-NEXT: { // callseq 48, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, %rs2, 3, 4}; @@ -1165,7 +1165,7 @@ define void @st_param_v4_i16_iiir(i16 %d) { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_iiir_param_0]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_iiir_param_0]; ; CHECK-NEXT: { // callseq 49, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, 3, %rs1}; @@ -1189,7 +1189,7 @@ define void @st_param_v4_i16_iiri(i16 %c) { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_iiri_param_0]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_iiri_param_0]; ; CHECK-NEXT: { // callseq 50, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {1, 2, %rs1, 4}; @@ -1213,7 +1213,7 @@ define void @st_param_v4_i16_irii(i16 %b) { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_irii_param_0]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_irii_param_0]; ; CHECK-NEXT: { // callseq 51, 0 ; 
CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {1, %rs1, 3, 4}; @@ -1237,7 +1237,7 @@ define void @st_param_v4_i16_riii(i16 %a) { ; CHECK-NEXT: .reg .b16 %rs<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [st_param_v4_i16_riii_param_0]; +; CHECK-NEXT: ld.param.b16 %rs1, [st_param_v4_i16_riii_param_0]; ; CHECK-NEXT: { // callseq 52, 0 ; CHECK-NEXT: .param .align 8 .b8 param0[8]; ; CHECK-NEXT: st.param.v4.b16 [param0], {%rs1, 2, 3, 4}; @@ -1281,9 +1281,9 @@ define void @st_param_v4_i32_irrr(i32 %b, i32 %c, i32 %d) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_irrr_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [st_param_v4_i32_irrr_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [st_param_v4_i32_irrr_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v4_i32_irrr_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [st_param_v4_i32_irrr_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [st_param_v4_i32_irrr_param_2]; ; CHECK-NEXT: { // callseq 54, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {1, %r1, %r2, %r3}; @@ -1307,9 +1307,9 @@ define void @st_param_v4_i32_rirr(i32 %a, i32 %c, i32 %d) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_rirr_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [st_param_v4_i32_rirr_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [st_param_v4_i32_rirr_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v4_i32_rirr_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [st_param_v4_i32_rirr_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [st_param_v4_i32_rirr_param_2]; ; CHECK-NEXT: { // callseq 55, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 2, %r2, %r3}; @@ -1333,9 +1333,9 @@ define void @st_param_v4_i32_rrir(i32 %a, i32 %b, i32 %d) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; 
CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_rrir_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [st_param_v4_i32_rrir_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [st_param_v4_i32_rrir_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v4_i32_rrir_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [st_param_v4_i32_rrir_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [st_param_v4_i32_rrir_param_2]; ; CHECK-NEXT: { // callseq 56, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, %r2, 3, %r3}; @@ -1359,9 +1359,9 @@ define void @st_param_v4_i32_rrri(i32 %a, i32 %b, i32 %c) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_rrri_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [st_param_v4_i32_rrri_param_1]; -; CHECK-NEXT: ld.param.u32 %r3, [st_param_v4_i32_rrri_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v4_i32_rrri_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [st_param_v4_i32_rrri_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [st_param_v4_i32_rrri_param_2]; ; CHECK-NEXT: { // callseq 57, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, %r2, %r3, 4}; @@ -1385,8 +1385,8 @@ define void @st_param_v4_i32_iirr(i32 %c, i32 %d) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_iirr_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [st_param_v4_i32_iirr_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v4_i32_iirr_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [st_param_v4_i32_iirr_param_1]; ; CHECK-NEXT: { // callseq 58, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {1, 2, %r1, %r2}; @@ -1410,8 +1410,8 @@ define void @st_param_v4_i32_irir(i32 %b, i32 %d) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_irir_param_0]; -; CHECK-NEXT: 
ld.param.u32 %r2, [st_param_v4_i32_irir_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v4_i32_irir_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [st_param_v4_i32_irir_param_1]; ; CHECK-NEXT: { // callseq 59, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {1, %r1, 3, %r2}; @@ -1435,8 +1435,8 @@ define void @st_param_v4_i32_irri(i32 %b, i32 %c) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_irri_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [st_param_v4_i32_irri_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v4_i32_irri_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [st_param_v4_i32_irri_param_1]; ; CHECK-NEXT: { // callseq 60, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {1, %r1, %r2, 4}; @@ -1460,8 +1460,8 @@ define void @st_param_v4_i32_riir(i32 %a, i32 %d) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_riir_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [st_param_v4_i32_riir_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v4_i32_riir_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [st_param_v4_i32_riir_param_1]; ; CHECK-NEXT: { // callseq 61, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 2, 3, %r2}; @@ -1485,8 +1485,8 @@ define void @st_param_v4_i32_riri(i32 %a, i32 %c) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_riri_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [st_param_v4_i32_riri_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v4_i32_riri_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [st_param_v4_i32_riri_param_1]; ; CHECK-NEXT: { // callseq 62, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 2, %r2, 4}; @@ -1510,8 +1510,8 @@ define void 
@st_param_v4_i32_rrii(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_rrii_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [st_param_v4_i32_rrii_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v4_i32_rrii_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [st_param_v4_i32_rrii_param_1]; ; CHECK-NEXT: { // callseq 63, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, %r2, 3, 4}; @@ -1535,7 +1535,7 @@ define void @st_param_v4_i32_iiir(i32 %d) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_iiir_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v4_i32_iiir_param_0]; ; CHECK-NEXT: { // callseq 64, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {1, 2, 3, %r1}; @@ -1559,7 +1559,7 @@ define void @st_param_v4_i32_iiri(i32 %c) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_iiri_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v4_i32_iiri_param_0]; ; CHECK-NEXT: { // callseq 65, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {1, 2, %r1, 4}; @@ -1583,7 +1583,7 @@ define void @st_param_v4_i32_irii(i32 %b) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_irii_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v4_i32_irii_param_0]; ; CHECK-NEXT: { // callseq 66, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {1, %r1, 3, 4}; @@ -1607,7 +1607,7 @@ define void @st_param_v4_i32_riii(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [st_param_v4_i32_riii_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [st_param_v4_i32_riii_param_0]; ; CHECK-NEXT: { // 
callseq 67, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; ; CHECK-NEXT: st.param.v4.b32 [param0], {%r1, 2, 3, 4}; @@ -1634,7 +1634,7 @@ define void @st_param_v4_f32_iiii() { ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: { // callseq 68, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, 0f40000000, 0f40400000, 0f40800000}; +; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, 0f40000000, 0f40400000, 0f40800000}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v4_f32, ; CHECK-NEXT: ( @@ -1651,12 +1651,12 @@ define void @st_param_v4_f32_irrr(float %b, float %c, float %d) { ; CHECK-NEXT: .reg .b32 %f<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_irrr_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_irrr_param_1]; -; CHECK-NEXT: ld.param.f32 %f3, [st_param_v4_f32_irrr_param_2]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v4_f32_irrr_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [st_param_v4_f32_irrr_param_1]; +; CHECK-NEXT: ld.param.b32 %f3, [st_param_v4_f32_irrr_param_2]; ; CHECK-NEXT: { // callseq 69, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, %f1, %f2, %f3}; +; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, %f1, %f2, %f3}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v4_f32, ; CHECK-NEXT: ( @@ -1677,12 +1677,12 @@ define void @st_param_v4_f32_rirr(float %a, float %c, float %d) { ; CHECK-NEXT: .reg .b32 %f<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_rirr_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_rirr_param_1]; -; CHECK-NEXT: ld.param.f32 %f3, [st_param_v4_f32_rirr_param_2]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v4_f32_rirr_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [st_param_v4_f32_rirr_param_1]; +; CHECK-NEXT: ld.param.b32 %f3, [st_param_v4_f32_rirr_param_2]; ; CHECK-NEXT: { // callseq 70, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; 
CHECK-NEXT: st.param.v4.f32 [param0], {%f1, 0f40000000, %f2, %f3}; +; CHECK-NEXT: st.param.v4.b32 [param0], {%f1, 0f40000000, %f2, %f3}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v4_f32, ; CHECK-NEXT: ( @@ -1703,12 +1703,12 @@ define void @st_param_v4_f32_rrir(float %a, float %b, float %d) { ; CHECK-NEXT: .reg .b32 %f<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_rrir_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_rrir_param_1]; -; CHECK-NEXT: ld.param.f32 %f3, [st_param_v4_f32_rrir_param_2]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v4_f32_rrir_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [st_param_v4_f32_rrir_param_1]; +; CHECK-NEXT: ld.param.b32 %f3, [st_param_v4_f32_rrir_param_2]; ; CHECK-NEXT: { // callseq 71, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v4.f32 [param0], {%f1, %f2, 0f40400000, %f3}; +; CHECK-NEXT: st.param.v4.b32 [param0], {%f1, %f2, 0f40400000, %f3}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v4_f32, ; CHECK-NEXT: ( @@ -1729,12 +1729,12 @@ define void @st_param_v4_f32_rrri(float %a, float %b, float %c) { ; CHECK-NEXT: .reg .b32 %f<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_rrri_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_rrri_param_1]; -; CHECK-NEXT: ld.param.f32 %f3, [st_param_v4_f32_rrri_param_2]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v4_f32_rrri_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [st_param_v4_f32_rrri_param_1]; +; CHECK-NEXT: ld.param.b32 %f3, [st_param_v4_f32_rrri_param_2]; ; CHECK-NEXT: { // callseq 72, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v4.f32 [param0], {%f1, %f2, %f3, 0f40800000}; +; CHECK-NEXT: st.param.v4.b32 [param0], {%f1, %f2, %f3, 0f40800000}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v4_f32, ; CHECK-NEXT: ( @@ -1755,11 +1755,11 @@ define void @st_param_v4_f32_iirr(float %c, float %d) { ; CHECK-NEXT: .reg .b32 %f<3>; ; 
CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_iirr_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_iirr_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v4_f32_iirr_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [st_param_v4_f32_iirr_param_1]; ; CHECK-NEXT: { // callseq 73, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, 0f40000000, %f1, %f2}; +; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, 0f40000000, %f1, %f2}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v4_f32, ; CHECK-NEXT: ( @@ -1780,11 +1780,11 @@ define void @st_param_v4_f32_irir(float %b, float %d) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_irir_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_irir_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v4_f32_irir_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [st_param_v4_f32_irir_param_1]; ; CHECK-NEXT: { // callseq 74, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, %f1, 0f40400000, %f2}; +; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, %f1, 0f40400000, %f2}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v4_f32, ; CHECK-NEXT: ( @@ -1805,11 +1805,11 @@ define void @st_param_v4_f32_irri(float %b, float %c) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_irri_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_irri_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v4_f32_irri_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [st_param_v4_f32_irri_param_1]; ; CHECK-NEXT: { // callseq 75, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, %f1, %f2, 0f40800000}; +; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, %f1, %f2, 0f40800000}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: 
call_v4_f32, ; CHECK-NEXT: ( @@ -1830,11 +1830,11 @@ define void @st_param_v4_f32_riir(float %a, float %d) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_riir_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_riir_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v4_f32_riir_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [st_param_v4_f32_riir_param_1]; ; CHECK-NEXT: { // callseq 76, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v4.f32 [param0], {%f1, 0f40000000, 0f40400000, %f2}; +; CHECK-NEXT: st.param.v4.b32 [param0], {%f1, 0f40000000, 0f40400000, %f2}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v4_f32, ; CHECK-NEXT: ( @@ -1855,11 +1855,11 @@ define void @st_param_v4_f32_riri(float %a, float %c) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_riri_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_riri_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v4_f32_riri_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [st_param_v4_f32_riri_param_1]; ; CHECK-NEXT: { // callseq 77, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v4.f32 [param0], {%f1, 0f40000000, %f2, 0f40800000}; +; CHECK-NEXT: st.param.v4.b32 [param0], {%f1, 0f40000000, %f2, 0f40800000}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v4_f32, ; CHECK-NEXT: ( @@ -1880,11 +1880,11 @@ define void @st_param_v4_f32_rrii(float %a, float %b) { ; CHECK-NEXT: .reg .b32 %f<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_rrii_param_0]; -; CHECK-NEXT: ld.param.f32 %f2, [st_param_v4_f32_rrii_param_1]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v4_f32_rrii_param_0]; +; CHECK-NEXT: ld.param.b32 %f2, [st_param_v4_f32_rrii_param_1]; ; CHECK-NEXT: { // callseq 78, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v4.f32 [param0], {%f1, %f2, 
0f40400000, 0f40800000}; +; CHECK-NEXT: st.param.v4.b32 [param0], {%f1, %f2, 0f40400000, 0f40800000}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v4_f32, ; CHECK-NEXT: ( @@ -1905,10 +1905,10 @@ define void @st_param_v4_f32_iiir(float %d) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_iiir_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v4_f32_iiir_param_0]; ; CHECK-NEXT: { // callseq 79, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, 0f40000000, 0f40400000, %f1}; +; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, 0f40000000, 0f40400000, %f1}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v4_f32, ; CHECK-NEXT: ( @@ -1929,10 +1929,10 @@ define void @st_param_v4_f32_iiri(float %c) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_iiri_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v4_f32_iiri_param_0]; ; CHECK-NEXT: { // callseq 80, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, 0f40000000, %f1, 0f40800000}; +; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, 0f40000000, %f1, 0f40800000}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v4_f32, ; CHECK-NEXT: ( @@ -1953,10 +1953,10 @@ define void @st_param_v4_f32_irii(float %b) { ; CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_irii_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v4_f32_irii_param_0]; ; CHECK-NEXT: { // callseq 81, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v4.f32 [param0], {0f3F800000, %f1, 0f40400000, 0f40800000}; +; CHECK-NEXT: st.param.v4.b32 [param0], {0f3F800000, %f1, 0f40400000, 0f40800000}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v4_f32, ; CHECK-NEXT: ( @@ -1977,10 +1977,10 @@ define void @st_param_v4_f32_riii(float %a) { ; 
CHECK-NEXT: .reg .b32 %f<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.f32 %f1, [st_param_v4_f32_riii_param_0]; +; CHECK-NEXT: ld.param.b32 %f1, [st_param_v4_f32_riii_param_0]; ; CHECK-NEXT: { // callseq 82, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[16]; -; CHECK-NEXT: st.param.v4.f32 [param0], {%f1, 0f40000000, 0f40400000, 0f40800000}; +; CHECK-NEXT: st.param.v4.b32 [param0], {%f1, 0f40000000, 0f40400000, 0f40800000}; ; CHECK-NEXT: call.uni ; CHECK-NEXT: call_v4_f32, ; CHECK-NEXT: ( diff --git a/llvm/test/CodeGen/NVPTX/st_bulk.ll b/llvm/test/CodeGen/NVPTX/st_bulk.ll index 785f78a6f951..944f221fb1af 100644 --- a/llvm/test/CodeGen/NVPTX/st_bulk.ll +++ b/llvm/test/CodeGen/NVPTX/st_bulk.ll @@ -11,8 +11,8 @@ define void @st_bulk(ptr %dest_addr, i64 %size) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [st_bulk_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [st_bulk_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [st_bulk_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [st_bulk_param_1]; ; CHECK-NEXT: st.bulk [%rd1], %rd2, 0; ; CHECK-NEXT: ret; call void @llvm.nvvm.st.bulk(ptr %dest_addr, i64 %size, i64 0) @@ -26,8 +26,8 @@ define void @st_bulk_shared_cta(ptr addrspace(3) %dest_addr, i64 %size) { ; CHECK-PTX64-NEXT: .reg .b64 %rd<3>; ; CHECK-PTX64-EMPTY: ; CHECK-PTX64-NEXT: // %bb.0: -; CHECK-PTX64-NEXT: ld.param.u64 %rd1, [st_bulk_shared_cta_param_0]; -; CHECK-PTX64-NEXT: ld.param.u64 %rd2, [st_bulk_shared_cta_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [st_bulk_shared_cta_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [st_bulk_shared_cta_param_1]; ; CHECK-PTX64-NEXT: st.bulk.shared::cta [%rd1], %rd2, 0; ; CHECK-PTX64-NEXT: ret; ; @@ -37,8 +37,8 @@ define void @st_bulk_shared_cta(ptr addrspace(3) %dest_addr, i64 %size) { ; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<2>; ; CHECK-PTX-SHARED32-EMPTY: ; CHECK-PTX-SHARED32-NEXT: // %bb.0: -; CHECK-PTX-SHARED32-NEXT: ld.param.u32 %r1, 
[st_bulk_shared_cta_param_0]; -; CHECK-PTX-SHARED32-NEXT: ld.param.u64 %rd1, [st_bulk_shared_cta_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [st_bulk_shared_cta_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [st_bulk_shared_cta_param_1]; ; CHECK-PTX-SHARED32-NEXT: st.bulk.shared::cta [%r1], %rd1, 0; ; CHECK-PTX-SHARED32-NEXT: ret; call void @llvm.nvvm.st.bulk.shared.cta(ptr addrspace(3) %dest_addr, i64 %size, i64 0) diff --git a/llvm/test/CodeGen/NVPTX/stacksaverestore.ll b/llvm/test/CodeGen/NVPTX/stacksaverestore.ll index e9169c942b8b..802ae26da41a 100644 --- a/llvm/test/CodeGen/NVPTX/stacksaverestore.ll +++ b/llvm/test/CodeGen/NVPTX/stacksaverestore.ll @@ -49,7 +49,7 @@ define void @test_restore(ptr %p) { ; CHECK-32-NEXT: .reg .b32 %r<3>; ; CHECK-32-EMPTY: ; CHECK-32-NEXT: // %bb.0: -; CHECK-32-NEXT: ld.param.u32 %r1, [test_restore_param_0]; +; CHECK-32-NEXT: ld.param.b32 %r1, [test_restore_param_0]; ; CHECK-32-NEXT: cvta.to.local.u32 %r2, %r1; ; CHECK-32-NEXT: stackrestore.u32 %r2; ; CHECK-32-NEXT: ret; @@ -59,7 +59,7 @@ define void @test_restore(ptr %p) { ; CHECK-64-NEXT: .reg .b64 %rd<3>; ; CHECK-64-EMPTY: ; CHECK-64-NEXT: // %bb.0: -; CHECK-64-NEXT: ld.param.u64 %rd1, [test_restore_param_0]; +; CHECK-64-NEXT: ld.param.b64 %rd1, [test_restore_param_0]; ; CHECK-64-NEXT: cvta.to.local.u64 %rd2, %rd1; ; CHECK-64-NEXT: stackrestore.u64 %rd2; ; CHECK-64-NEXT: ret; @@ -70,7 +70,7 @@ define void @test_restore(ptr %p) { ; CHECK-MIXED-NEXT: .reg .b64 %rd<3>; ; CHECK-MIXED-EMPTY: ; CHECK-MIXED-NEXT: // %bb.0: -; CHECK-MIXED-NEXT: ld.param.u64 %rd1, [test_restore_param_0]; +; CHECK-MIXED-NEXT: ld.param.b64 %rd1, [test_restore_param_0]; ; CHECK-MIXED-NEXT: cvta.to.local.u64 %rd2, %rd1; ; CHECK-MIXED-NEXT: cvt.u32.u64 %r1, %rd2; ; CHECK-MIXED-NEXT: stackrestore.u32 %r1; diff --git a/llvm/test/CodeGen/NVPTX/store-retval.ll b/llvm/test/CodeGen/NVPTX/store-retval.ll index 6a60c97b854b..3bb7c8381508 100644 --- a/llvm/test/CodeGen/NVPTX/store-retval.ll +++ 
b/llvm/test/CodeGen/NVPTX/store-retval.ll @@ -23,7 +23,7 @@ define %struct.StNoalign @func_StNoalign(ptr nocapture noundef readonly byval(%struct.StNoalign) align 4 %in) { ; CHECK-LABEL: .func{{.*}}func_StNoalign - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [func_StNoalign_param_0]; + ; CHECK: ld.param.b32 [[R1:%r[0-9]+]], [func_StNoalign_param_0]; ; CHECK-NOT: st.param.b32 [func_retval0+0], %r{{[0-9]+}}; ; CHECK-NOT: st.param.b32 [func_retval0+4], %r{{[0-9]+}}; ; CHECK-NOT: st.param.b32 [func_retval0+8], %r{{[0-9]+}}; @@ -39,7 +39,7 @@ define %struct.StNoalign @func_StNoalign(ptr nocapture noundef readonly byval(%s define %struct.StAlign8 @func_StAlign8(ptr nocapture noundef readonly byval(%struct.StAlign8) align 8 %in) { ; CHECK-LABEL: .func{{.*}}func_StAlign8 - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [func_StAlign8_param_0]; + ; CHECK: ld.param.b32 [[R1:%r[0-9]+]], [func_StAlign8_param_0]; ; CHECK-NOT: st.param.b32 [func_retval0+0], %r{{[0-9]+}}; ; CHECK-NOT: st.param.b32 [func_retval0+4], %r{{[0-9]+}}; ; CHECK-NOT: st.param.b32 [func_retval0+8], %r{{[0-9]+}}; @@ -56,7 +56,7 @@ define %struct.StAlign8 @func_StAlign8(ptr nocapture noundef readonly byval(%str define %struct.StAlign16 @func_StAlign16(ptr nocapture noundef readonly byval(%struct.StAlign16) align 16 %in) { ; CHECK-LABEL: .func{{.*}}func_StAlign16 - ; CHECK: ld.param.u32 [[R1:%r[0-9]+]], [func_StAlign16_param_0]; + ; CHECK: ld.param.b32 [[R1:%r[0-9]+]], [func_StAlign16_param_0]; ; CHECK-NOT: st.param.b32 [func_retval0+0], %r{{[0-9]+}}; ; CHECK-NOT: st.param.b32 [func_retval0+4], %r{{[0-9]+}}; ; CHECK-NOT: st.param.b32 [func_retval0+8], %r{{[0-9]+}}; diff --git a/llvm/test/CodeGen/NVPTX/store-undef.ll b/llvm/test/CodeGen/NVPTX/store-undef.ll index d871331cf1ec..52415b05e03d 100644 --- a/llvm/test/CodeGen/NVPTX/store-undef.ll +++ b/llvm/test/CodeGen/NVPTX/store-undef.ll @@ -34,8 +34,8 @@ define void @test_store_param_def(i64 %param0, i32 %param1) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: 
// %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_store_param_def_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [test_store_param_def_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_store_param_def_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_store_param_def_param_1]; ; CHECK-NEXT: { // callseq 1, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[32]; ; CHECK-NEXT: st.param.b64 [param0], %rd1; @@ -75,12 +75,12 @@ define void @test_store_def(i64 %param0, i32 %param1, ptr %out) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_store_def_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [test_store_def_param_1]; -; CHECK-NEXT: ld.param.u64 %rd2, [test_store_def_param_2]; -; CHECK-NEXT: st.v4.u32 [%rd2+16], {%r2, %r1, %r3, %r4}; -; CHECK-NEXT: st.v2.u32 [%rd2+8], {%r5, %r1}; -; CHECK-NEXT: st.u64 [%rd2], %rd1; +; CHECK-NEXT: ld.param.b64 %rd1, [test_store_def_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_store_def_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [test_store_def_param_2]; +; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r2, %r1, %r3, %r4}; +; CHECK-NEXT: st.v2.b32 [%rd2+8], {%r5, %r1}; +; CHECK-NEXT: st.b64 [%rd2], %rd1; ; CHECK-NEXT: ret; %V2 = insertelement <2 x i32> undef, i32 %param1, i32 1 %V4 = insertelement <4 x i32> undef, i32 %param1, i32 1 @@ -98,16 +98,16 @@ define void @test_store_volatile_undef(ptr %out, <8 x i32> %vec) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_store_volatile_undef_param_0]; -; CHECK-NEXT: st.volatile.v4.u32 [%rd1+16], {%r1, %r2, %r3, %r4}; -; CHECK-NEXT: st.volatile.v2.u32 [%rd1+8], {%r5, %r6}; -; CHECK-NEXT: st.volatile.u64 [%rd1], %rd2; -; CHECK-NEXT: ld.param.v4.u32 {%r7, %r8, %r9, %r10}, [test_store_volatile_undef_param_1]; -; CHECK-NEXT: ld.param.v4.u32 {%r11, %r12, %r13, %r14}, [test_store_volatile_undef_param_1+16]; -; CHECK-NEXT: st.volatile.v4.u32 [%rd3], {%r11, %r12, %r13, %r14}; -; CHECK-NEXT: 
st.volatile.v4.u32 [%rd4], {%r7, %r8, %r9, %r10}; -; CHECK-NEXT: st.volatile.v4.u32 [%rd1+16], {%r15, %r16, %r17, %r18}; -; CHECK-NEXT: st.volatile.v4.u32 [%rd1], {%r19, %r20, %r21, %r22}; +; CHECK-NEXT: ld.param.b64 %rd1, [test_store_volatile_undef_param_0]; +; CHECK-NEXT: st.volatile.v4.b32 [%rd1+16], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: st.volatile.v2.b32 [%rd1+8], {%r5, %r6}; +; CHECK-NEXT: st.volatile.b64 [%rd1], %rd2; +; CHECK-NEXT: ld.param.v4.b32 {%r7, %r8, %r9, %r10}, [test_store_volatile_undef_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r11, %r12, %r13, %r14}, [test_store_volatile_undef_param_1+16]; +; CHECK-NEXT: st.volatile.v4.b32 [%rd3], {%r11, %r12, %r13, %r14}; +; CHECK-NEXT: st.volatile.v4.b32 [%rd4], {%r7, %r8, %r9, %r10}; +; CHECK-NEXT: st.volatile.v4.b32 [%rd1+16], {%r15, %r16, %r17, %r18}; +; CHECK-NEXT: st.volatile.v4.b32 [%rd1], {%r19, %r20, %r21, %r22}; ; CHECK-NEXT: ret; store volatile %struct.T undef, ptr %out store volatile <8 x i32> %vec, ptr undef @@ -122,10 +122,10 @@ define void @test_store_volatile_of_poison(ptr %out) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_store_volatile_of_poison_param_0]; -; CHECK-NEXT: st.volatile.v4.u32 [%rd1+16], {%r1, %r2, %r3, %r4}; -; CHECK-NEXT: st.volatile.v2.u32 [%rd1+8], {%r5, %r6}; -; CHECK-NEXT: st.volatile.u64 [%rd1], %rd2; +; CHECK-NEXT: ld.param.b64 %rd1, [test_store_volatile_of_poison_param_0]; +; CHECK-NEXT: st.volatile.v4.b32 [%rd1+16], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: st.volatile.v2.b32 [%rd1+8], {%r5, %r6}; +; CHECK-NEXT: st.volatile.b64 [%rd1], %rd2; ; CHECK-NEXT: ret; store volatile %struct.T poison, ptr %out ret void @@ -138,12 +138,12 @@ define void @test_store_volatile_to_poison(%struct.T %param) { ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [test_store_volatile_to_poison_param_0]; -; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, 
[test_store_volatile_to_poison_param_0+8]; -; CHECK-NEXT: ld.param.v4.u32 {%r3, %r4, %r5, %r6}, [test_store_volatile_to_poison_param_0+16]; -; CHECK-NEXT: st.volatile.v4.u32 [%rd2], {%r3, %r4, %r5, %r6}; -; CHECK-NEXT: st.volatile.v2.u32 [%rd3], {%r1, %r2}; -; CHECK-NEXT: st.volatile.u64 [%rd4], %rd1; +; CHECK-NEXT: ld.param.b64 %rd1, [test_store_volatile_to_poison_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [test_store_volatile_to_poison_param_0+8]; +; CHECK-NEXT: ld.param.v4.b32 {%r3, %r4, %r5, %r6}, [test_store_volatile_to_poison_param_0+16]; +; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r3, %r4, %r5, %r6}; +; CHECK-NEXT: st.volatile.v2.b32 [%rd3], {%r1, %r2}; +; CHECK-NEXT: st.volatile.b64 [%rd4], %rd1; ; CHECK-NEXT: ret; store volatile %struct.T %param, ptr poison ret void diff --git a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll index 3afff3245fbf..ae74bbb866eb 100644 --- a/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll +++ b/llvm/test/CodeGen/NVPTX/surf-read-cuda.ll @@ -18,13 +18,13 @@ define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [foo_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [foo_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [foo_param_1]; ; CHECK-NEXT: cvta.to.global.u64 %rd3, %rd2; -; CHECK-NEXT: ld.param.u32 %r1, [foo_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [foo_param_2]; ; CHECK-NEXT: suld.b.1d.b32.trap {%r2}, [%rd1, {%r1}]; ; CHECK-NEXT: cvt.rn.f32.s32 %f1, %r2; -; CHECK-NEXT: st.global.f32 [%rd3], %f1; +; CHECK-NEXT: st.global.b32 [%rd3], %f1; ; CHECK-NEXT: ret; %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx) %ret = sitofp i32 %val to float @@ -42,12 +42,12 @@ define ptx_kernel void @bar(ptr %red, i32 %idx) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, 
[bar_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [bar_param_0]; ; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1; -; CHECK-NEXT: ld.param.u32 %r1, [bar_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [bar_param_1]; ; CHECK-NEXT: suld.b.1d.b32.trap {%r2}, [surf0, {%r1}]; ; CHECK-NEXT: cvt.rn.f32.s32 %f1, %r2; -; CHECK-NEXT: st.global.f32 [%rd2], %f1; +; CHECK-NEXT: st.global.b32 [%rd2], %f1; ; CHECK-NEXT: ret; %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @surf0) %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %surfHandle, i32 %idx) diff --git a/llvm/test/CodeGen/NVPTX/surf-read.ll b/llvm/test/CodeGen/NVPTX/surf-read.ll index 3166622f613c..8dee5250920e 100644 --- a/llvm/test/CodeGen/NVPTX/surf-read.ll +++ b/llvm/test/CodeGen/NVPTX/surf-read.ll @@ -12,7 +12,7 @@ define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) { %val = tail call i32 @llvm.nvvm.suld.1d.i32.trap(i64 %img, i32 %idx) ; CHECK: cvt.rn.f32.s32 %f[[REDF:[0-9]+]], %r[[RED]] %ret = sitofp i32 %val to float -; CHECK: st.f32 [%rd{{[0-9]+}}], %f[[REDF]] +; CHECK: st.b32 [%rd{{[0-9]+}}], %f[[REDF]] store float %ret, ptr %red ret void } diff --git a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll index 5dc44cb1925b..abc2ea89b62c 100644 --- a/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll +++ b/llvm/test/CodeGen/NVPTX/surf-write-cuda.ll @@ -17,9 +17,9 @@ define ptx_kernel void @foo(i64 %img, i32 %val, i32 %idx) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [foo_param_0]; -; CHECK-NEXT: ld.param.u32 %r1, [foo_param_1]; -; CHECK-NEXT: ld.param.u32 %r2, [foo_param_2]; +; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [foo_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [foo_param_2]; ; CHECK-NEXT: sust.b.1d.b32.trap [%rd1, {%r2}], {%r1}; ; CHECK-NEXT: ret; tail call void @llvm.nvvm.sust.b.1d.i32.trap(i64 %img, i32 %idx, i32 %val) @@ -37,8 +37,8 @@ define 
ptx_kernel void @bar(i32 %val, i32 %idx) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [bar_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [bar_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [bar_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [bar_param_1]; ; CHECK-NEXT: sust.b.1d.b32.trap [surf0, {%r2}], {%r1}; ; CHECK-NEXT: ret; %surfHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @surf0) diff --git a/llvm/test/CodeGen/NVPTX/szext.ll b/llvm/test/CodeGen/NVPTX/szext.ll index f159156c6b80..5a4fe4ed7fc0 100644 --- a/llvm/test/CodeGen/NVPTX/szext.ll +++ b/llvm/test/CodeGen/NVPTX/szext.ll @@ -10,8 +10,8 @@ define i32 @szext_wrap_u32(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [szext_wrap_u32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [szext_wrap_u32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [szext_wrap_u32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [szext_wrap_u32_param_1]; ; CHECK-NEXT: szext.wrap.u32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -25,8 +25,8 @@ define i32 @szext_clamp_u32(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [szext_clamp_u32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [szext_clamp_u32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [szext_clamp_u32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [szext_clamp_u32_param_1]; ; CHECK-NEXT: szext.clamp.u32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -40,8 +40,8 @@ define i32 @szext_wrap_s32(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [szext_wrap_s32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [szext_wrap_s32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [szext_wrap_s32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, 
[szext_wrap_s32_param_1]; ; CHECK-NEXT: szext.wrap.s32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -55,8 +55,8 @@ define i32 @szext_clamp_s32(i32 %a, i32 %b) { ; CHECK-NEXT: .reg .b32 %r<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [szext_clamp_s32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [szext_clamp_s32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [szext_clamp_s32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [szext_clamp_s32_param_1]; ; CHECK-NEXT: szext.clamp.s32 %r3, %r1, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r3; ; CHECK-NEXT: ret; @@ -84,7 +84,7 @@ define i32 @szext_wrap_s32_ir(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [szext_wrap_s32_ir_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [szext_wrap_s32_ir_param_0]; ; CHECK-NEXT: szext.wrap.s32 %r2, 5, %r1; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; @@ -98,7 +98,7 @@ define i32 @szext_clamp_u32_ri(i32 %a) { ; CHECK-NEXT: .reg .b32 %r<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [szext_clamp_u32_ri_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [szext_clamp_u32_ri_param_0]; ; CHECK-NEXT: szext.clamp.u32 %r2, %r1, 7; ; CHECK-NEXT: st.param.b32 [func_retval0], %r2; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll b/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll index 26967faa01a1..39cd054716b5 100644 --- a/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll +++ b/llvm/test/CodeGen/NVPTX/tag-invariant-loads.ll @@ -18,12 +18,12 @@ define ptx_kernel void @basic(ptr noalias readonly %a, ptr %out) { ; PTX-NEXT: .reg .b32 %f<2>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u32 %r1, [basic_param_0]; +; PTX-NEXT: ld.param.b32 %r1, [basic_param_0]; ; PTX-NEXT: cvta.to.global.u32 %r2, %r1; -; PTX-NEXT: ld.param.u32 %r3, [basic_param_1]; +; PTX-NEXT: ld.param.b32 %r3, [basic_param_1]; ; PTX-NEXT: 
cvta.to.global.u32 %r4, %r3; -; PTX-NEXT: ld.global.nc.f32 %f1, [%r2]; -; PTX-NEXT: st.global.f32 [%r4], %f1; +; PTX-NEXT: ld.global.nc.b32 %f1, [%r2]; +; PTX-NEXT: st.global.b32 [%r4], %f1; ; PTX-NEXT: ret; %a_global = addrspacecast ptr %a to ptr addrspace(1) %val = load float, ptr addrspace(1) %a_global @@ -47,18 +47,18 @@ define ptx_kernel void @select(ptr noalias readonly %a, ptr noalias readonly %b, ; PTX-NEXT: .reg .b32 %r<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u8 %rs1, [select_param_2]; +; PTX-NEXT: ld.param.b8 %rs1, [select_param_2]; ; PTX-NEXT: and.b16 %rs2, %rs1, 1; ; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0; -; PTX-NEXT: ld.param.u32 %r1, [select_param_0]; +; PTX-NEXT: ld.param.b32 %r1, [select_param_0]; ; PTX-NEXT: cvta.to.global.u32 %r2, %r1; -; PTX-NEXT: ld.param.u32 %r3, [select_param_1]; +; PTX-NEXT: ld.param.b32 %r3, [select_param_1]; ; PTX-NEXT: cvta.to.global.u32 %r4, %r3; -; PTX-NEXT: ld.param.u32 %r5, [select_param_3]; +; PTX-NEXT: ld.param.b32 %r5, [select_param_3]; ; PTX-NEXT: cvta.to.global.u32 %r6, %r5; ; PTX-NEXT: selp.b32 %r7, %r2, %r4, %p1; -; PTX-NEXT: ld.global.nc.u32 %r8, [%r7]; -; PTX-NEXT: st.global.u32 [%r6], %r8; +; PTX-NEXT: ld.global.nc.b32 %r8, [%r7]; +; PTX-NEXT: st.global.b32 [%r6], %r8; ; PTX-NEXT: ret; %select = select i1 %c, ptr %a, ptr %b %select_global = addrspacecast ptr %select to ptr addrspace(1) @@ -81,11 +81,11 @@ define void @not_kernel(ptr noalias readonly %a, ptr %out) { ; PTX-NEXT: .reg .b32 %f<2>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u32 %r1, [not_kernel_param_0]; +; PTX-NEXT: ld.param.b32 %r1, [not_kernel_param_0]; ; PTX-NEXT: cvta.to.global.u32 %r2, %r1; -; PTX-NEXT: ld.param.u32 %r3, [not_kernel_param_1]; -; PTX-NEXT: ld.global.f32 %f1, [%r2]; -; PTX-NEXT: st.f32 [%r3], %f1; +; PTX-NEXT: ld.param.b32 %r3, [not_kernel_param_1]; +; PTX-NEXT: ld.global.b32 %f1, [%r2]; +; PTX-NEXT: st.b32 [%r3], %f1; ; PTX-NEXT: ret; %a_global = addrspacecast ptr %a to ptr addrspace(1) %val = 
load float, ptr addrspace(1) %a_global @@ -114,17 +114,17 @@ define ptx_kernel void @global_load(ptr noalias readonly %a, i1 %c, ptr %out) { ; PTX-NEXT: .reg .b64 %rd<2>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u8 %rs1, [global_load_param_1]; +; PTX-NEXT: ld.param.b8 %rs1, [global_load_param_1]; ; PTX-NEXT: and.b16 %rs2, %rs1, 1; ; PTX-NEXT: setp.ne.b16 %p1, %rs2, 0; -; PTX-NEXT: ld.param.u32 %r1, [global_load_param_0]; +; PTX-NEXT: ld.param.b32 %r1, [global_load_param_0]; ; PTX-NEXT: cvta.to.global.u32 %r2, %r1; -; PTX-NEXT: ld.param.u32 %r3, [global_load_param_2]; +; PTX-NEXT: ld.param.b32 %r3, [global_load_param_2]; ; PTX-NEXT: cvta.to.global.u32 %r4, %r3; ; PTX-NEXT: mov.b32 %r5, G; ; PTX-NEXT: selp.b32 %r6, %r5, %r2, %p1; -; PTX-NEXT: ld.global.nc.u64 %rd1, [%r6]; -; PTX-NEXT: st.global.u64 [%r4], %rd1; +; PTX-NEXT: ld.global.nc.b64 %rd1, [%r6]; +; PTX-NEXT: st.global.b64 [%r4], %rd1; ; PTX-NEXT: ret; %g_global = addrspacecast ptr @G to ptr addrspace(1) %a_global = addrspacecast ptr %a to ptr addrspace(1) diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll index f80b5a5e16ea..9c60af914faf 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-alloc.ll @@ -17,8 +17,8 @@ define void @test_tcgen05_alloc(ptr %addr, i32 %ncols) { ; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-NEXT: // %bb.0: -; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_alloc_param_0]; -; CHECK_PTX64-NEXT: ld.param.u32 %r1, [test_tcgen05_alloc_param_1]; +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_param_0]; +; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_param_1]; ; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.b32 [%rd1], %r1; ; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1; ; CHECK_PTX64-NEXT: ret; @@ -29,8 +29,8 @@ define void @test_tcgen05_alloc(ptr %addr, i32 %ncols) { ; CHECK_PTX64_SHARED32-NEXT: 
.reg .b64 %rd<2>; ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-NEXT: // %bb.0: -; CHECK_PTX64_SHARED32-NEXT: ld.param.u64 %rd1, [test_tcgen05_alloc_param_0]; -; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r1, [test_tcgen05_alloc_param_1]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_param_1]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.b32 [%rd1], %r1; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.b32 [%rd1], %r1; ; CHECK_PTX64_SHARED32-NEXT: ret; @@ -48,8 +48,8 @@ define void @test_tcgen05_alloc_shared(ptr addrspace(3) %addr, i32 %ncols) { ; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-NEXT: // %bb.0: -; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_alloc_shared_param_0]; -; CHECK_PTX64-NEXT: ld.param.u32 %r1, [test_tcgen05_alloc_shared_param_1]; +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_alloc_shared_param_0]; +; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_shared_param_1]; ; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%rd1], %r1; ; CHECK_PTX64-NEXT: tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%rd1], %r1; ; CHECK_PTX64-NEXT: ret; @@ -59,8 +59,8 @@ define void @test_tcgen05_alloc_shared(ptr addrspace(3) %addr, i32 %ncols) { ; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<3>; ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-NEXT: // %bb.0: -; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r1, [test_tcgen05_alloc_shared_param_0]; -; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r2, [test_tcgen05_alloc_shared_param_1]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_alloc_shared_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r2, [test_tcgen05_alloc_shared_param_1]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.alloc.cta_group::1.sync.aligned.shared::cta.b32 [%r1], %r2; ; CHECK_PTX64_SHARED32-NEXT: 
tcgen05.alloc.cta_group::2.sync.aligned.shared::cta.b32 [%r1], %r2; ; CHECK_PTX64_SHARED32-NEXT: ret; @@ -80,8 +80,8 @@ define void @test_tcgen05_dealloc(ptr addrspace(6) %tmem_addr, i32 %ncols) { ; CHECK_PTX64-NEXT: .reg .b32 %r<3>; ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-NEXT: // %bb.0: -; CHECK_PTX64-NEXT: ld.param.u32 %r1, [test_tcgen05_dealloc_param_0]; -; CHECK_PTX64-NEXT: ld.param.u32 %r2, [test_tcgen05_dealloc_param_1]; +; CHECK_PTX64-NEXT: ld.param.b32 %r1, [test_tcgen05_dealloc_param_0]; +; CHECK_PTX64-NEXT: ld.param.b32 %r2, [test_tcgen05_dealloc_param_1]; ; CHECK_PTX64-NEXT: tcgen05.dealloc.cta_group::1.sync.aligned.b32 %r1, %r2; ; CHECK_PTX64-NEXT: tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2; ; CHECK_PTX64-NEXT: ret; @@ -91,8 +91,8 @@ define void @test_tcgen05_dealloc(ptr addrspace(6) %tmem_addr, i32 %ncols) { ; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<3>; ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-NEXT: // %bb.0: -; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r1, [test_tcgen05_dealloc_param_0]; -; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r2, [test_tcgen05_dealloc_param_1]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_dealloc_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r2, [test_tcgen05_dealloc_param_1]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.dealloc.cta_group::1.sync.aligned.b32 %r1, %r2; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.dealloc.cta_group::2.sync.aligned.b32 %r1, %r2; ; CHECK_PTX64_SHARED32-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll index 6e0ec6bcf446..cc3b359d0624 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-commit.ll @@ -16,7 +16,7 @@ define void @test_tcgen05_commit(ptr %bar_addr) { ; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-NEXT: // %bb.0: -; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_param_0]; +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, 
[test_tcgen05_commit_param_0]; ; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; ; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; ; CHECK_PTX64-NEXT: ret; @@ -26,7 +26,7 @@ define void @test_tcgen05_commit(ptr %bar_addr) { ; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-NEXT: // %bb.0: -; CHECK_PTX64_SHARED32-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_param_0]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; ; CHECK_PTX64_SHARED32-NEXT: ret; @@ -44,7 +44,7 @@ define void @test_tcgen05_commit_shared(ptr addrspace(3) %bar_addr) { ; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-NEXT: // %bb.0: -; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_shared_param_0]; +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_shared_param_0]; ; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; ; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%rd1]; ; CHECK_PTX64-NEXT: ret; @@ -54,7 +54,7 @@ define void @test_tcgen05_commit_shared(ptr addrspace(3) %bar_addr) { ; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>; ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-NEXT: // %bb.0: -; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r1, [test_tcgen05_commit_shared_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_commit_shared_param_0]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.b64 [%r1]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.b64 [%r1]; ; 
CHECK_PTX64_SHARED32-NEXT: ret; @@ -78,8 +78,8 @@ define void @test_tcgen05_commit_mc(ptr %bar_addr, i16 %cta_mask) { ; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-NEXT: // %bb.0: -; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_mc_param_0]; -; CHECK_PTX64-NEXT: ld.param.u16 %rs1, [test_tcgen05_commit_mc_param_1]; +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_param_0]; +; CHECK_PTX64-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_param_1]; ; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; ; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; ; CHECK_PTX64-NEXT: ret; @@ -90,8 +90,8 @@ define void @test_tcgen05_commit_mc(ptr %bar_addr, i16 %cta_mask) { ; CHECK_PTX64_SHARED32-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-NEXT: // %bb.0: -; CHECK_PTX64_SHARED32-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_mc_param_0]; -; CHECK_PTX64_SHARED32-NEXT: ld.param.u16 %rs1, [test_tcgen05_commit_mc_param_1]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b64 %rd1, [test_tcgen05_commit_mc_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_param_1]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; ; CHECK_PTX64_SHARED32-NEXT: ret; @@ -110,8 +110,8 @@ define void @test_tcgen05_commit_mc_shared(ptr addrspace(3) %bar_addr, i16 %cta_ ; CHECK_PTX64-NEXT: .reg .b64 %rd<2>; ; CHECK_PTX64-EMPTY: ; CHECK_PTX64-NEXT: // %bb.0: -; CHECK_PTX64-NEXT: ld.param.u64 %rd1, [test_tcgen05_commit_mc_shared_param_0]; -; CHECK_PTX64-NEXT: ld.param.u16 %rs1, [test_tcgen05_commit_mc_shared_param_1]; +; CHECK_PTX64-NEXT: ld.param.b64 %rd1, 
[test_tcgen05_commit_mc_shared_param_0]; +; CHECK_PTX64-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_param_1]; ; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; ; CHECK_PTX64-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%rd1], %rs1; ; CHECK_PTX64-NEXT: ret; @@ -122,8 +122,8 @@ define void @test_tcgen05_commit_mc_shared(ptr addrspace(3) %bar_addr, i16 %cta_ ; CHECK_PTX64_SHARED32-NEXT: .reg .b32 %r<2>; ; CHECK_PTX64_SHARED32-EMPTY: ; CHECK_PTX64_SHARED32-NEXT: // %bb.0: -; CHECK_PTX64_SHARED32-NEXT: ld.param.u32 %r1, [test_tcgen05_commit_mc_shared_param_0]; -; CHECK_PTX64_SHARED32-NEXT: ld.param.u16 %rs1, [test_tcgen05_commit_mc_shared_param_1]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b32 %r1, [test_tcgen05_commit_mc_shared_param_0]; +; CHECK_PTX64_SHARED32-NEXT: ld.param.b16 %rs1, [test_tcgen05_commit_mc_shared_param_1]; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::1.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%r1], %rs1; ; CHECK_PTX64_SHARED32-NEXT: tcgen05.commit.cta_group::2.mbarrier::arrive::one.shared::cluster.multicast::cluster.b64 [%r1], %rs1; ; CHECK_PTX64_SHARED32-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll index 50dc93325c28..780116c42380 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll @@ -10,8 +10,8 @@ define void @test_tcgen05_cp_64x128_v1(ptr addrspace(6) %addr, i64 %sdesc) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [%r1], 
%rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [%r1], %rd1; ; CHECK-NEXT: ret; @@ -29,8 +29,8 @@ define void @test_tcgen05_cp_64x128_v2(ptr addrspace(6) %addr, i64 %sdesc) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [%r1], %rd1; ; CHECK-NEXT: ret; @@ -48,8 +48,8 @@ define void @test_tcgen05_cp_32x128(ptr addrspace(6) %addr, i64 %sdesc) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_32x128_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_32x128_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4 [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4 [%r1], %rd1; ; CHECK-NEXT: ret; @@ -68,8 +68,8 @@ define void @test_tcgen05_cp_128x128b(ptr addrspace(6) %addr, i64 %sdesc) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x128b_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b [%r1], %rd1; ; CHECK-NEXT: ret; @@ -87,8 +87,8 @@ define void @test_tcgen05_cp_128x256b(ptr addrspace(6) %addr, i64 %sdesc) { ; CHECK-NEXT: .reg .b64 
%rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x256b_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b [%r1], %rd1; ; CHECK-NEXT: ret; @@ -106,8 +106,8 @@ define void @test_tcgen05_cp_4x256b(ptr addrspace(6) %addr, i64 %sdesc) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_4x256b_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b [%r1], %rd1; ; CHECK-NEXT: ret; @@ -126,8 +126,8 @@ define void @test_tcgen05_cp_128x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sde ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x256b_b6x16_p32_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_b6x16_p32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b6x16_p32_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: ret; @@ -145,8 +145,8 @@ define void @test_tcgen05_cp_4x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_4x256b_b6x16_p32_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, 
[test_tcgen05_cp_4x256b_b6x16_p32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b6x16_p32_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: ret; @@ -164,8 +164,8 @@ define void @test_tcgen05_cp_128x128b_b6x16_p32(ptr addrspace(6) %addr, i64 %sde ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x128b_b6x16_p32_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_b6x16_p32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b6x16_p32_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: ret; @@ -183,8 +183,8 @@ define void @test_tcgen05_cp_64x128_v1_b6x16_p32(ptr addrspace(6) %addr, i64 %sd ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: ret; @@ -202,8 +202,8 @@ define void @test_tcgen05_cp_64x128_v2_b6x16_p32(ptr addrspace(6) %addr, i64 %sd ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_0]; -; CHECK-NEXT: ld.param.u64 
%rd1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: ret; @@ -221,8 +221,8 @@ define void @test_tcgen05_cp_32x128_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_32x128_b6x16_p32_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_32x128_b6x16_p32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b6x16_p32_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1; ; CHECK-NEXT: ret; @@ -241,8 +241,8 @@ define void @test_tcgen05_cp_128x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sde ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x256b_b4x16_p64_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_b4x16_p64_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x256b_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x256b_b4x16_p64_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: ret; @@ -260,8 +260,8 @@ define void @test_tcgen05_cp_4x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_4x256b_b4x16_p64_param_0]; -; 
CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_b4x16_p64_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_4x256b_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_4x256b_b4x16_p64_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: ret; @@ -279,8 +279,8 @@ define void @test_tcgen05_cp_128x128b_b4x16_p64(ptr addrspace(6) %addr, i64 %sde ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x128b_b4x16_p64_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_b4x16_p64_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_128x128b_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_128x128b_b4x16_p64_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: ret; @@ -298,8 +298,8 @@ define void @test_tcgen05_cp_64x128_v1_b4x16_p64(ptr addrspace(6) %addr, i64 %sd ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: ret; @@ -317,8 +317,8 @@ define void @test_tcgen05_cp_64x128_v2_b4x16_p64(ptr addrspace(6) %addr, i64 %sd ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, 
[test_tcgen05_cp_64x128_v2_b4x16_p64_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: ret; @@ -336,8 +336,8 @@ define void @test_tcgen05_cp_32x128_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_32x128_b4x16_p64_param_0]; -; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_32x128_b4x16_p64_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_cp_32x128_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [test_tcgen05_cp_32x128_b4x16_p64_param_1]; ; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll index 83dbcb1bc02b..7e65338c4525 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-ld.ll @@ -11,7 +11,7 @@ define void @nvvm_tcgen05_ld_16x64b(ptr addrspace(6) %taddr) { ; CHECK-NEXT: .reg .b32 %r<257>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_ld_16x64b_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_16x64b_param_0]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x64b.x1.b32 {%r2}, [%r1]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x64b.x2.b32 {%r3, %r4}, [%r1]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x64b.x4.b32 {%r5, %r6, %r7, %r8}, [%r1]; @@ -46,7 +46,7 @@ define void @nvvm_tcgen05_ld_16x64b_pack(ptr addrspace(6) %taddr) { ; CHECK-NEXT: .reg .b32 %r<257>; ; 
CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_ld_16x64b_pack_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_16x64b_pack_param_0]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x64b.x1.pack::16b.b32 {%r2}, [%r1]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x64b.x2.pack::16b.b32 {%r3, %r4}, [%r1]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x64b.x4.pack::16b.b32 {%r5, %r6, %r7, %r8}, [%r1]; @@ -81,7 +81,7 @@ define void @nvvm_tcgen05_ld_16x128b(ptr addrspace(6) %taddr) { ; CHECK-NEXT: .reg .b32 %r<256>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_ld_16x128b_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_16x128b_param_0]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x128b.x1.b32 {%r2, %r3}, [%r1]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x128b.x2.b32 {%r4, %r5, %r6, %r7}, [%r1]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x128b.x4.b32 {%r8, %r9, %r10, %r11, %r12, %r13, %r14, %r15}, [%r1]; @@ -113,7 +113,7 @@ define void @nvvm_tcgen05_ld_16x128b_pack(ptr addrspace(6) %taddr) { ; CHECK-NEXT: .reg .b32 %r<256>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_ld_16x128b_pack_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_16x128b_pack_param_0]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x128b.x1.pack::16b.b32 {%r2, %r3}, [%r1]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x128b.x2.pack::16b.b32 {%r4, %r5, %r6, %r7}, [%r1]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x128b.x4.pack::16b.b32 {%r8, %r9, %r10, %r11, %r12, %r13, %r14, %r15}, [%r1]; @@ -145,7 +145,7 @@ define void @nvvm_tcgen05_ld_16x256b(ptr addrspace(6) %taddr) { ; CHECK-NEXT: .reg .b32 %r<254>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_ld_16x256b_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_16x256b_param_0]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x256b.x1.b32 {%r2, %r3, %r4, %r5}, [%r1]; ; CHECK-NEXT: 
tcgen05.ld.sync.aligned.16x256b.x2.b32 {%r6, %r7, %r8, %r9, %r10, %r11, %r12, %r13}, [%r1]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x256b.x4.b32 {%r14, %r15, %r16, %r17, %r18, %r19, %r20, %r21, %r22, %r23, %r24, %r25, %r26, %r27, %r28, %r29}, [%r1]; @@ -174,7 +174,7 @@ define void @nvvm_tcgen05_ld_16x256b_pack(ptr addrspace(6) %taddr) { ; CHECK-NEXT: .reg .b32 %r<254>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_ld_16x256b_pack_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_16x256b_pack_param_0]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x256b.x1.pack::16b.b32 {%r2, %r3, %r4, %r5}, [%r1]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x256b.x2.pack::16b.b32 {%r6, %r7, %r8, %r9, %r10, %r11, %r12, %r13}, [%r1]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x256b.x4.pack::16b.b32 {%r14, %r15, %r16, %r17, %r18, %r19, %r20, %r21, %r22, %r23, %r24, %r25, %r26, %r27, %r28, %r29}, [%r1]; @@ -203,7 +203,7 @@ define void @nvvm_tcgen05_ld_32x32b(ptr addrspace(6) %taddr) { ; CHECK-NEXT: .reg .b32 %r<257>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_ld_32x32b_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_32x32b_param_0]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.32x32b.x1.b32 {%r2}, [%r1]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.32x32b.x2.b32 {%r3, %r4}, [%r1]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.32x32b.x4.b32 {%r5, %r6, %r7, %r8}, [%r1]; @@ -237,7 +237,7 @@ define void @nvvm_tcgen05_ld_32x32b_pack(ptr addrspace(6) %taddr) { ; CHECK-NEXT: .reg .b32 %r<257>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_ld_32x32b_pack_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_32x32b_pack_param_0]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.32x32b.x1.pack::16b.b32 {%r2}, [%r1]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.32x32b.x2.pack::16b.b32 {%r3, %r4}, [%r1]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.32x32b.x4.pack::16b.b32 {%r5, %r6, %r7, %r8}, [%r1]; @@ 
-272,7 +272,7 @@ define void @nvvm_tcgen05_ld_16x32bx2(ptr addrspace(6) %taddr) { ; CHECK-NEXT: .reg .b32 %r<257>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_ld_16x32bx2_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_16x32bx2_param_0]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x32bx2.x1.b32 {%r2}, [%r1], 2; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x32bx2.x2.b32 {%r3, %r4}, [%r1], 2; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x32bx2.x4.b32 {%r5, %r6, %r7, %r8}, [%r1], 2; @@ -306,7 +306,7 @@ define void @nvvm_tcgen05_ld_16x32bx2_pack(ptr addrspace(6) %taddr) { ; CHECK-NEXT: .reg .b32 %r<257>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_ld_16x32bx2_pack_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_ld_16x32bx2_pack_param_0]; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x32bx2.x1.pack::16b.b32 {%r2}, [%r1], 2; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x32bx2.x2.pack::16b.b32 {%r3, %r4}, [%r1], 2; ; CHECK-NEXT: tcgen05.ld.sync.aligned.16x32bx2.x4.pack::16b.b32 {%r5, %r6, %r7, %r8}, [%r1], 2; diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll index 13a45b9d86dc..590d75533bb8 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll @@ -12,7 +12,7 @@ define void @test_tcgen05_shift(ptr addrspace(6) %tmem_addr) { ; CHECK-NEXT: .reg .b32 %r<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_shift_param_0]; +; CHECK-NEXT: ld.param.b32 %r1, [test_tcgen05_shift_param_0]; ; CHECK-NEXT: tcgen05.shift.cta_group::1.down [%r1]; ; CHECK-NEXT: tcgen05.shift.cta_group::2.down [%r1]; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll index c22f795193c7..c323a54d75d7 100644 --- a/llvm/test/CodeGen/NVPTX/tcgen05-st.ll +++ b/llvm/test/CodeGen/NVPTX/tcgen05-st.ll @@ -11,79 +11,79 @@ define void 
@nvvm_tcgen05_st_16x64b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i32 ; CHECK-NEXT: .reg .b32 %r<257>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_st_16x64b_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [nvvm_tcgen05_st_16x64b_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_st_16x64b_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [nvvm_tcgen05_st_16x64b_param_1]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x1.b32 [%r1], {%r2}; -; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [nvvm_tcgen05_st_16x64b_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [nvvm_tcgen05_st_16x64b_param_2]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x2.b32 [%r1], {%r3, %r4}; -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x64b_param_3]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x64b_param_3]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x4.b32 [%r1], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x64b_param_4+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x64b_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x64b_param_4+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x64b_param_4]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x8.b32 [%r1], {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}; -; CHECK-NEXT: ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x64b_param_5+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_16x64b_param_5+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x64b_param_5+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x64b_param_5]; +; CHECK-NEXT: ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x64b_param_5+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_16x64b_param_5+32]; +; 
CHECK-NEXT: ld.param.v4.b32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x64b_param_5+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x64b_param_5]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x16.b32 [%r1], {%r29, %r30, %r31, %r32, %r25, %r26, %r27, %r28, %r21, %r22, %r23, %r24, %r17, %r18, %r19, %r20}; -; CHECK-NEXT: ld.param.v4.u32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x64b_param_6+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x64b_param_6+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x64b_param_6+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x64b_param_6+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_16x64b_param_6+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x64b_param_6+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x64b_param_6+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x64b_param_6]; +; CHECK-NEXT: ld.param.v4.b32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x64b_param_6+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x64b_param_6+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x64b_param_6+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x64b_param_6+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_16x64b_param_6+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x64b_param_6+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x64b_param_6+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x64b_param_6]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x32.b32 [%r1], {%r61, %r62, %r63, %r64, %r57, %r58, %r59, %r60, %r53, %r54, %r55, %r56, %r49, %r50, %r51, %r52, 
%r45, %r46, %r47, %r48, %r41, %r42, %r43, %r44, %r37, %r38, %r39, %r40, %r33, %r34, %r35, %r36}; -; CHECK-NEXT: ld.param.v4.u32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x64b_param_7+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x64b_param_7+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x64b_param_7+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_16x64b_param_7+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x64b_param_7+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x64b_param_7+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x64b_param_7+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_16x64b_param_7+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x64b_param_7+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x64b_param_7+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x64b_param_7+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x64b_param_7+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x64b_param_7+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x64b_param_7+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_16x64b_param_7+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_16x64b_param_7]; +; CHECK-NEXT: ld.param.v4.b32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x64b_param_7+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x64b_param_7+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x64b_param_7+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r77, %r78, %r79, %r80}, 
[nvvm_tcgen05_st_16x64b_param_7+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x64b_param_7+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x64b_param_7+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x64b_param_7+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_16x64b_param_7+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x64b_param_7+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x64b_param_7+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x64b_param_7+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x64b_param_7+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x64b_param_7+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x64b_param_7+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_16x64b_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_16x64b_param_7]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x64.b32 [%r1], {%r125, %r126, %r127, %r128, %r121, %r122, %r123, %r124, %r117, %r118, %r119, %r120, %r113, %r114, %r115, %r116, %r109, %r110, %r111, %r112, %r105, %r106, %r107, %r108, %r101, %r102, %r103, %r104, %r97, %r98, %r99, %r100, %r93, %r94, %r95, %r96, %r89, %r90, %r91, %r92, %r85, %r86, %r87, %r88, %r81, %r82, %r83, %r84, %r77, %r78, %r79, %r80, %r73, %r74, %r75, %r76, %r69, %r70, %r71, %r72, %r65, %r66, %r67, %r68}; -; CHECK-NEXT: ld.param.v4.u32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x64b_param_8+496]; -; CHECK-NEXT: ld.param.v4.u32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x64b_param_8+480]; -; CHECK-NEXT: ld.param.v4.u32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x64b_param_8+464]; -; CHECK-NEXT: ld.param.v4.u32 
{%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x64b_param_8+448]; -; CHECK-NEXT: ld.param.v4.u32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x64b_param_8+432]; -; CHECK-NEXT: ld.param.v4.u32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_16x64b_param_8+416]; -; CHECK-NEXT: ld.param.v4.u32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x64b_param_8+400]; -; CHECK-NEXT: ld.param.v4.u32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x64b_param_8+384]; -; CHECK-NEXT: ld.param.v4.u32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x64b_param_8+368]; -; CHECK-NEXT: ld.param.v4.u32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x64b_param_8+352]; -; CHECK-NEXT: ld.param.v4.u32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x64b_param_8+336]; -; CHECK-NEXT: ld.param.v4.u32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_16x64b_param_8+320]; -; CHECK-NEXT: ld.param.v4.u32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x64b_param_8+304]; -; CHECK-NEXT: ld.param.v4.u32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x64b_param_8+288]; -; CHECK-NEXT: ld.param.v4.u32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x64b_param_8+272]; -; CHECK-NEXT: ld.param.v4.u32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x64b_param_8+256]; -; CHECK-NEXT: ld.param.v4.u32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_16x64b_param_8+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_16x64b_param_8+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x64b_param_8+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x64b_param_8+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x64b_param_8+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_16x64b_param_8+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x64b_param_8+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r221, %r222, 
%r223, %r224}, [nvvm_tcgen05_st_16x64b_param_8+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_16x64b_param_8+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x64b_param_8+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x64b_param_8+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x64b_param_8+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x64b_param_8+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x64b_param_8+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x64b_param_8+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_16x64b_param_8]; +; CHECK-NEXT: ld.param.v4.b32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x64b_param_8+496]; +; CHECK-NEXT: ld.param.v4.b32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x64b_param_8+480]; +; CHECK-NEXT: ld.param.v4.b32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x64b_param_8+464]; +; CHECK-NEXT: ld.param.v4.b32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x64b_param_8+448]; +; CHECK-NEXT: ld.param.v4.b32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x64b_param_8+432]; +; CHECK-NEXT: ld.param.v4.b32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_16x64b_param_8+416]; +; CHECK-NEXT: ld.param.v4.b32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x64b_param_8+400]; +; CHECK-NEXT: ld.param.v4.b32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x64b_param_8+384]; +; CHECK-NEXT: ld.param.v4.b32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x64b_param_8+368]; +; CHECK-NEXT: ld.param.v4.b32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x64b_param_8+352]; +; CHECK-NEXT: ld.param.v4.b32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x64b_param_8+336]; +; CHECK-NEXT: ld.param.v4.b32 {%r173, %r174, %r175, %r176}, 
[nvvm_tcgen05_st_16x64b_param_8+320]; +; CHECK-NEXT: ld.param.v4.b32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x64b_param_8+304]; +; CHECK-NEXT: ld.param.v4.b32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x64b_param_8+288]; +; CHECK-NEXT: ld.param.v4.b32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x64b_param_8+272]; +; CHECK-NEXT: ld.param.v4.b32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x64b_param_8+256]; +; CHECK-NEXT: ld.param.v4.b32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_16x64b_param_8+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_16x64b_param_8+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x64b_param_8+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x64b_param_8+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x64b_param_8+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_16x64b_param_8+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x64b_param_8+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_16x64b_param_8+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_16x64b_param_8+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x64b_param_8+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x64b_param_8+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x64b_param_8+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x64b_param_8+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x64b_param_8+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x64b_param_8+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r253, %r254, %r255, %r256}, 
[nvvm_tcgen05_st_16x64b_param_8]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x128.b32 [%r1], {%r253, %r254, %r255, %r256, %r249, %r250, %r251, %r252, %r245, %r246, %r247, %r248, %r241, %r242, %r243, %r244, %r237, %r238, %r239, %r240, %r233, %r234, %r235, %r236, %r229, %r230, %r231, %r232, %r225, %r226, %r227, %r228, %r221, %r222, %r223, %r224, %r217, %r218, %r219, %r220, %r213, %r214, %r215, %r216, %r209, %r210, %r211, %r212, %r205, %r206, %r207, %r208, %r201, %r202, %r203, %r204, %r197, %r198, %r199, %r200, %r193, %r194, %r195, %r196, %r189, %r190, %r191, %r192, %r185, %r186, %r187, %r188, %r181, %r182, %r183, %r184, %r177, %r178, %r179, %r180, %r173, %r174, %r175, %r176, %r169, %r170, %r171, %r172, %r165, %r166, %r167, %r168, %r161, %r162, %r163, %r164, %r157, %r158, %r159, %r160, %r153, %r154, %r155, %r156, %r149, %r150, %r151, %r152, %r145, %r146, %r147, %r148, %r141, %r142, %r143, %r144, %r137, %r138, %r139, %r140, %r133, %r134, %r135, %r136, %r129, %r130, %r131, %r132}; ; CHECK-NEXT: ret; tail call void @llvm.nvvm.tcgen05.st.16x64b.x1(ptr addrspace(6) %taddr, i32 %stv1, i1 0) @@ -111,79 +111,79 @@ define void @nvvm_tcgen05_st_16x64b_unpack(ptr addrspace(6) %taddr, i32 %stv1, < ; CHECK-NEXT: .reg .b32 %r<257>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_st_16x64b_unpack_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [nvvm_tcgen05_st_16x64b_unpack_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_st_16x64b_unpack_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [nvvm_tcgen05_st_16x64b_unpack_param_1]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x1.unpack::16b.b32 [%r1], {%r2}; -; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [nvvm_tcgen05_st_16x64b_unpack_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [nvvm_tcgen05_st_16x64b_unpack_param_2]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x2.unpack::16b.b32 [%r1], {%r3, %r4}; -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x64b_unpack_param_3]; +; 
CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x64b_unpack_param_3]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x4.unpack::16b.b32 [%r1], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x64b_unpack_param_4+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x64b_unpack_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x64b_unpack_param_4+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x64b_unpack_param_4]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x8.unpack::16b.b32 [%r1], {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}; -; CHECK-NEXT: ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x64b_unpack_param_5+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_16x64b_unpack_param_5+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x64b_unpack_param_5+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x64b_unpack_param_5]; +; CHECK-NEXT: ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x64b_unpack_param_5+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_16x64b_unpack_param_5+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x64b_unpack_param_5+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x64b_unpack_param_5]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x16.unpack::16b.b32 [%r1], {%r29, %r30, %r31, %r32, %r25, %r26, %r27, %r28, %r21, %r22, %r23, %r24, %r17, %r18, %r19, %r20}; -; CHECK-NEXT: ld.param.v4.u32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x64b_unpack_param_6+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x64b_unpack_param_6+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x64b_unpack_param_6+80]; -; CHECK-NEXT: 
ld.param.v4.u32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x64b_unpack_param_6+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_16x64b_unpack_param_6+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x64b_unpack_param_6+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x64b_unpack_param_6+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x64b_unpack_param_6]; +; CHECK-NEXT: ld.param.v4.b32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x64b_unpack_param_6+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x64b_unpack_param_6+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x64b_unpack_param_6+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x64b_unpack_param_6+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_16x64b_unpack_param_6+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x64b_unpack_param_6+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x64b_unpack_param_6+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x64b_unpack_param_6]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x32.unpack::16b.b32 [%r1], {%r61, %r62, %r63, %r64, %r57, %r58, %r59, %r60, %r53, %r54, %r55, %r56, %r49, %r50, %r51, %r52, %r45, %r46, %r47, %r48, %r41, %r42, %r43, %r44, %r37, %r38, %r39, %r40, %r33, %r34, %r35, %r36}; -; CHECK-NEXT: ld.param.v4.u32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x64b_unpack_param_7+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x64b_unpack_param_7+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x64b_unpack_param_7+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_16x64b_unpack_param_7+192]; -; CHECK-NEXT: ld.param.v4.u32 
{%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x64b_unpack_param_7+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x64b_unpack_param_7+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x64b_unpack_param_7+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_16x64b_unpack_param_7+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x64b_unpack_param_7+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x64b_unpack_param_7+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x64b_unpack_param_7+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x64b_unpack_param_7+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x64b_unpack_param_7+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x64b_unpack_param_7+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_16x64b_unpack_param_7+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_16x64b_unpack_param_7]; +; CHECK-NEXT: ld.param.v4.b32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x64b_unpack_param_7+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x64b_unpack_param_7+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x64b_unpack_param_7+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_16x64b_unpack_param_7+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x64b_unpack_param_7+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x64b_unpack_param_7+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x64b_unpack_param_7+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r93, %r94, %r95, %r96}, 
[nvvm_tcgen05_st_16x64b_unpack_param_7+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x64b_unpack_param_7+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x64b_unpack_param_7+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x64b_unpack_param_7+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x64b_unpack_param_7+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x64b_unpack_param_7+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x64b_unpack_param_7+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_16x64b_unpack_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_16x64b_unpack_param_7]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x64.unpack::16b.b32 [%r1], {%r125, %r126, %r127, %r128, %r121, %r122, %r123, %r124, %r117, %r118, %r119, %r120, %r113, %r114, %r115, %r116, %r109, %r110, %r111, %r112, %r105, %r106, %r107, %r108, %r101, %r102, %r103, %r104, %r97, %r98, %r99, %r100, %r93, %r94, %r95, %r96, %r89, %r90, %r91, %r92, %r85, %r86, %r87, %r88, %r81, %r82, %r83, %r84, %r77, %r78, %r79, %r80, %r73, %r74, %r75, %r76, %r69, %r70, %r71, %r72, %r65, %r66, %r67, %r68}; -; CHECK-NEXT: ld.param.v4.u32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x64b_unpack_param_8+496]; -; CHECK-NEXT: ld.param.v4.u32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x64b_unpack_param_8+480]; -; CHECK-NEXT: ld.param.v4.u32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x64b_unpack_param_8+464]; -; CHECK-NEXT: ld.param.v4.u32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x64b_unpack_param_8+448]; -; CHECK-NEXT: ld.param.v4.u32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x64b_unpack_param_8+432]; -; CHECK-NEXT: ld.param.v4.u32 {%r149, %r150, %r151, %r152}, 
[nvvm_tcgen05_st_16x64b_unpack_param_8+416]; -; CHECK-NEXT: ld.param.v4.u32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x64b_unpack_param_8+400]; -; CHECK-NEXT: ld.param.v4.u32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x64b_unpack_param_8+384]; -; CHECK-NEXT: ld.param.v4.u32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x64b_unpack_param_8+368]; -; CHECK-NEXT: ld.param.v4.u32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x64b_unpack_param_8+352]; -; CHECK-NEXT: ld.param.v4.u32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x64b_unpack_param_8+336]; -; CHECK-NEXT: ld.param.v4.u32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_16x64b_unpack_param_8+320]; -; CHECK-NEXT: ld.param.v4.u32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x64b_unpack_param_8+304]; -; CHECK-NEXT: ld.param.v4.u32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x64b_unpack_param_8+288]; -; CHECK-NEXT: ld.param.v4.u32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x64b_unpack_param_8+272]; -; CHECK-NEXT: ld.param.v4.u32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x64b_unpack_param_8+256]; -; CHECK-NEXT: ld.param.v4.u32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_16x64b_unpack_param_8+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_16x64b_unpack_param_8+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x64b_unpack_param_8+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x64b_unpack_param_8+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x64b_unpack_param_8+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_16x64b_unpack_param_8+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x64b_unpack_param_8+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_16x64b_unpack_param_8+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r225, %r226, 
%r227, %r228}, [nvvm_tcgen05_st_16x64b_unpack_param_8+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x64b_unpack_param_8+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x64b_unpack_param_8+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x64b_unpack_param_8+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x64b_unpack_param_8+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x64b_unpack_param_8+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x64b_unpack_param_8+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_16x64b_unpack_param_8]; +; CHECK-NEXT: ld.param.v4.b32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x64b_unpack_param_8+496]; +; CHECK-NEXT: ld.param.v4.b32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x64b_unpack_param_8+480]; +; CHECK-NEXT: ld.param.v4.b32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x64b_unpack_param_8+464]; +; CHECK-NEXT: ld.param.v4.b32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x64b_unpack_param_8+448]; +; CHECK-NEXT: ld.param.v4.b32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x64b_unpack_param_8+432]; +; CHECK-NEXT: ld.param.v4.b32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_16x64b_unpack_param_8+416]; +; CHECK-NEXT: ld.param.v4.b32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x64b_unpack_param_8+400]; +; CHECK-NEXT: ld.param.v4.b32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x64b_unpack_param_8+384]; +; CHECK-NEXT: ld.param.v4.b32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x64b_unpack_param_8+368]; +; CHECK-NEXT: ld.param.v4.b32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x64b_unpack_param_8+352]; +; CHECK-NEXT: ld.param.v4.b32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x64b_unpack_param_8+336]; +; CHECK-NEXT: ld.param.v4.b32 {%r173, 
%r174, %r175, %r176}, [nvvm_tcgen05_st_16x64b_unpack_param_8+320]; +; CHECK-NEXT: ld.param.v4.b32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x64b_unpack_param_8+304]; +; CHECK-NEXT: ld.param.v4.b32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x64b_unpack_param_8+288]; +; CHECK-NEXT: ld.param.v4.b32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x64b_unpack_param_8+272]; +; CHECK-NEXT: ld.param.v4.b32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x64b_unpack_param_8+256]; +; CHECK-NEXT: ld.param.v4.b32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_16x64b_unpack_param_8+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_16x64b_unpack_param_8+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x64b_unpack_param_8+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x64b_unpack_param_8+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x64b_unpack_param_8+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_16x64b_unpack_param_8+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x64b_unpack_param_8+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_16x64b_unpack_param_8+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_16x64b_unpack_param_8+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x64b_unpack_param_8+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x64b_unpack_param_8+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x64b_unpack_param_8+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x64b_unpack_param_8+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x64b_unpack_param_8+32]; +; CHECK-NEXT: 
ld.param.v4.b32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x64b_unpack_param_8+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_16x64b_unpack_param_8]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x64b.x128.unpack::16b.b32 [%r1], {%r253, %r254, %r255, %r256, %r249, %r250, %r251, %r252, %r245, %r246, %r247, %r248, %r241, %r242, %r243, %r244, %r237, %r238, %r239, %r240, %r233, %r234, %r235, %r236, %r229, %r230, %r231, %r232, %r225, %r226, %r227, %r228, %r221, %r222, %r223, %r224, %r217, %r218, %r219, %r220, %r213, %r214, %r215, %r216, %r209, %r210, %r211, %r212, %r205, %r206, %r207, %r208, %r201, %r202, %r203, %r204, %r197, %r198, %r199, %r200, %r193, %r194, %r195, %r196, %r189, %r190, %r191, %r192, %r185, %r186, %r187, %r188, %r181, %r182, %r183, %r184, %r177, %r178, %r179, %r180, %r173, %r174, %r175, %r176, %r169, %r170, %r171, %r172, %r165, %r166, %r167, %r168, %r161, %r162, %r163, %r164, %r157, %r158, %r159, %r160, %r153, %r154, %r155, %r156, %r149, %r150, %r151, %r152, %r145, %r146, %r147, %r148, %r141, %r142, %r143, %r144, %r137, %r138, %r139, %r140, %r133, %r134, %r135, %r136, %r129, %r130, %r131, %r132}; ; CHECK-NEXT: ret; tail call void @llvm.nvvm.tcgen05.st.16x64b.x1(ptr addrspace(6) %taddr, i32 %stv1, i1 1) @@ -211,77 +211,77 @@ define void @nvvm_tcgen05_st_16x128b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i3 ; CHECK-NEXT: .reg .b32 %r<256>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_st_16x128b_param_0]; -; CHECK-NEXT: ld.param.v2.u32 {%r2, %r3}, [nvvm_tcgen05_st_16x128b_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_st_16x128b_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r2, %r3}, [nvvm_tcgen05_st_16x128b_param_2]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x128b.x1.b32 [%r1], {%r2, %r3}; -; CHECK-NEXT: ld.param.v4.u32 {%r4, %r5, %r6, %r7}, [nvvm_tcgen05_st_16x128b_param_3]; +; CHECK-NEXT: ld.param.v4.b32 {%r4, %r5, %r6, %r7}, [nvvm_tcgen05_st_16x128b_param_3]; ; CHECK-NEXT: 
tcgen05.st.sync.aligned.16x128b.x2.b32 [%r1], {%r4, %r5, %r6, %r7}; -; CHECK-NEXT: ld.param.v4.u32 {%r8, %r9, %r10, %r11}, [nvvm_tcgen05_st_16x128b_param_4+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r12, %r13, %r14, %r15}, [nvvm_tcgen05_st_16x128b_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r8, %r9, %r10, %r11}, [nvvm_tcgen05_st_16x128b_param_4+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r12, %r13, %r14, %r15}, [nvvm_tcgen05_st_16x128b_param_4]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x128b.x4.b32 [%r1], {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}; -; CHECK-NEXT: ld.param.v4.u32 {%r16, %r17, %r18, %r19}, [nvvm_tcgen05_st_16x128b_param_5+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r20, %r21, %r22, %r23}, [nvvm_tcgen05_st_16x128b_param_5+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r24, %r25, %r26, %r27}, [nvvm_tcgen05_st_16x128b_param_5+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r28, %r29, %r30, %r31}, [nvvm_tcgen05_st_16x128b_param_5]; +; CHECK-NEXT: ld.param.v4.b32 {%r16, %r17, %r18, %r19}, [nvvm_tcgen05_st_16x128b_param_5+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r20, %r21, %r22, %r23}, [nvvm_tcgen05_st_16x128b_param_5+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r24, %r25, %r26, %r27}, [nvvm_tcgen05_st_16x128b_param_5+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r28, %r29, %r30, %r31}, [nvvm_tcgen05_st_16x128b_param_5]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x128b.x8.b32 [%r1], {%r28, %r29, %r30, %r31, %r24, %r25, %r26, %r27, %r20, %r21, %r22, %r23, %r16, %r17, %r18, %r19}; -; CHECK-NEXT: ld.param.v4.u32 {%r32, %r33, %r34, %r35}, [nvvm_tcgen05_st_16x128b_param_6+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r36, %r37, %r38, %r39}, [nvvm_tcgen05_st_16x128b_param_6+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r40, %r41, %r42, %r43}, [nvvm_tcgen05_st_16x128b_param_6+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r44, %r45, %r46, %r47}, [nvvm_tcgen05_st_16x128b_param_6+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r48, %r49, %r50, %r51}, [nvvm_tcgen05_st_16x128b_param_6+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r52, %r53, %r54, %r55}, 
[nvvm_tcgen05_st_16x128b_param_6+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r56, %r57, %r58, %r59}, [nvvm_tcgen05_st_16x128b_param_6+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r60, %r61, %r62, %r63}, [nvvm_tcgen05_st_16x128b_param_6]; +; CHECK-NEXT: ld.param.v4.b32 {%r32, %r33, %r34, %r35}, [nvvm_tcgen05_st_16x128b_param_6+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r36, %r37, %r38, %r39}, [nvvm_tcgen05_st_16x128b_param_6+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r40, %r41, %r42, %r43}, [nvvm_tcgen05_st_16x128b_param_6+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r44, %r45, %r46, %r47}, [nvvm_tcgen05_st_16x128b_param_6+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r48, %r49, %r50, %r51}, [nvvm_tcgen05_st_16x128b_param_6+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r52, %r53, %r54, %r55}, [nvvm_tcgen05_st_16x128b_param_6+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r56, %r57, %r58, %r59}, [nvvm_tcgen05_st_16x128b_param_6+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r60, %r61, %r62, %r63}, [nvvm_tcgen05_st_16x128b_param_6]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x128b.x16.b32 [%r1], {%r60, %r61, %r62, %r63, %r56, %r57, %r58, %r59, %r52, %r53, %r54, %r55, %r48, %r49, %r50, %r51, %r44, %r45, %r46, %r47, %r40, %r41, %r42, %r43, %r36, %r37, %r38, %r39, %r32, %r33, %r34, %r35}; -; CHECK-NEXT: ld.param.v4.u32 {%r64, %r65, %r66, %r67}, [nvvm_tcgen05_st_16x128b_param_7+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r68, %r69, %r70, %r71}, [nvvm_tcgen05_st_16x128b_param_7+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r72, %r73, %r74, %r75}, [nvvm_tcgen05_st_16x128b_param_7+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r76, %r77, %r78, %r79}, [nvvm_tcgen05_st_16x128b_param_7+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r80, %r81, %r82, %r83}, [nvvm_tcgen05_st_16x128b_param_7+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r84, %r85, %r86, %r87}, [nvvm_tcgen05_st_16x128b_param_7+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r88, %r89, %r90, %r91}, [nvvm_tcgen05_st_16x128b_param_7+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r92, %r93, %r94, %r95}, 
[nvvm_tcgen05_st_16x128b_param_7+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r96, %r97, %r98, %r99}, [nvvm_tcgen05_st_16x128b_param_7+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r100, %r101, %r102, %r103}, [nvvm_tcgen05_st_16x128b_param_7+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r104, %r105, %r106, %r107}, [nvvm_tcgen05_st_16x128b_param_7+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r108, %r109, %r110, %r111}, [nvvm_tcgen05_st_16x128b_param_7+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r112, %r113, %r114, %r115}, [nvvm_tcgen05_st_16x128b_param_7+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r116, %r117, %r118, %r119}, [nvvm_tcgen05_st_16x128b_param_7+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r120, %r121, %r122, %r123}, [nvvm_tcgen05_st_16x128b_param_7+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r124, %r125, %r126, %r127}, [nvvm_tcgen05_st_16x128b_param_7]; +; CHECK-NEXT: ld.param.v4.b32 {%r64, %r65, %r66, %r67}, [nvvm_tcgen05_st_16x128b_param_7+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r68, %r69, %r70, %r71}, [nvvm_tcgen05_st_16x128b_param_7+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r72, %r73, %r74, %r75}, [nvvm_tcgen05_st_16x128b_param_7+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r76, %r77, %r78, %r79}, [nvvm_tcgen05_st_16x128b_param_7+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r80, %r81, %r82, %r83}, [nvvm_tcgen05_st_16x128b_param_7+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r84, %r85, %r86, %r87}, [nvvm_tcgen05_st_16x128b_param_7+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r88, %r89, %r90, %r91}, [nvvm_tcgen05_st_16x128b_param_7+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r92, %r93, %r94, %r95}, [nvvm_tcgen05_st_16x128b_param_7+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r96, %r97, %r98, %r99}, [nvvm_tcgen05_st_16x128b_param_7+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r100, %r101, %r102, %r103}, [nvvm_tcgen05_st_16x128b_param_7+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r104, %r105, %r106, %r107}, [nvvm_tcgen05_st_16x128b_param_7+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r108, %r109, %r110, %r111}, [nvvm_tcgen05_st_16x128b_param_7+64]; +; 
CHECK-NEXT: ld.param.v4.b32 {%r112, %r113, %r114, %r115}, [nvvm_tcgen05_st_16x128b_param_7+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r116, %r117, %r118, %r119}, [nvvm_tcgen05_st_16x128b_param_7+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r120, %r121, %r122, %r123}, [nvvm_tcgen05_st_16x128b_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r124, %r125, %r126, %r127}, [nvvm_tcgen05_st_16x128b_param_7]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x128b.x32.b32 [%r1], {%r124, %r125, %r126, %r127, %r120, %r121, %r122, %r123, %r116, %r117, %r118, %r119, %r112, %r113, %r114, %r115, %r108, %r109, %r110, %r111, %r104, %r105, %r106, %r107, %r100, %r101, %r102, %r103, %r96, %r97, %r98, %r99, %r92, %r93, %r94, %r95, %r88, %r89, %r90, %r91, %r84, %r85, %r86, %r87, %r80, %r81, %r82, %r83, %r76, %r77, %r78, %r79, %r72, %r73, %r74, %r75, %r68, %r69, %r70, %r71, %r64, %r65, %r66, %r67}; -; CHECK-NEXT: ld.param.v4.u32 {%r128, %r129, %r130, %r131}, [nvvm_tcgen05_st_16x128b_param_8+496]; -; CHECK-NEXT: ld.param.v4.u32 {%r132, %r133, %r134, %r135}, [nvvm_tcgen05_st_16x128b_param_8+480]; -; CHECK-NEXT: ld.param.v4.u32 {%r136, %r137, %r138, %r139}, [nvvm_tcgen05_st_16x128b_param_8+464]; -; CHECK-NEXT: ld.param.v4.u32 {%r140, %r141, %r142, %r143}, [nvvm_tcgen05_st_16x128b_param_8+448]; -; CHECK-NEXT: ld.param.v4.u32 {%r144, %r145, %r146, %r147}, [nvvm_tcgen05_st_16x128b_param_8+432]; -; CHECK-NEXT: ld.param.v4.u32 {%r148, %r149, %r150, %r151}, [nvvm_tcgen05_st_16x128b_param_8+416]; -; CHECK-NEXT: ld.param.v4.u32 {%r152, %r153, %r154, %r155}, [nvvm_tcgen05_st_16x128b_param_8+400]; -; CHECK-NEXT: ld.param.v4.u32 {%r156, %r157, %r158, %r159}, [nvvm_tcgen05_st_16x128b_param_8+384]; -; CHECK-NEXT: ld.param.v4.u32 {%r160, %r161, %r162, %r163}, [nvvm_tcgen05_st_16x128b_param_8+368]; -; CHECK-NEXT: ld.param.v4.u32 {%r164, %r165, %r166, %r167}, [nvvm_tcgen05_st_16x128b_param_8+352]; -; CHECK-NEXT: ld.param.v4.u32 {%r168, %r169, %r170, %r171}, [nvvm_tcgen05_st_16x128b_param_8+336]; -; CHECK-NEXT: ld.param.v4.u32 
{%r172, %r173, %r174, %r175}, [nvvm_tcgen05_st_16x128b_param_8+320]; -; CHECK-NEXT: ld.param.v4.u32 {%r176, %r177, %r178, %r179}, [nvvm_tcgen05_st_16x128b_param_8+304]; -; CHECK-NEXT: ld.param.v4.u32 {%r180, %r181, %r182, %r183}, [nvvm_tcgen05_st_16x128b_param_8+288]; -; CHECK-NEXT: ld.param.v4.u32 {%r184, %r185, %r186, %r187}, [nvvm_tcgen05_st_16x128b_param_8+272]; -; CHECK-NEXT: ld.param.v4.u32 {%r188, %r189, %r190, %r191}, [nvvm_tcgen05_st_16x128b_param_8+256]; -; CHECK-NEXT: ld.param.v4.u32 {%r192, %r193, %r194, %r195}, [nvvm_tcgen05_st_16x128b_param_8+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r196, %r197, %r198, %r199}, [nvvm_tcgen05_st_16x128b_param_8+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r200, %r201, %r202, %r203}, [nvvm_tcgen05_st_16x128b_param_8+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r204, %r205, %r206, %r207}, [nvvm_tcgen05_st_16x128b_param_8+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r208, %r209, %r210, %r211}, [nvvm_tcgen05_st_16x128b_param_8+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r212, %r213, %r214, %r215}, [nvvm_tcgen05_st_16x128b_param_8+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r216, %r217, %r218, %r219}, [nvvm_tcgen05_st_16x128b_param_8+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r220, %r221, %r222, %r223}, [nvvm_tcgen05_st_16x128b_param_8+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r224, %r225, %r226, %r227}, [nvvm_tcgen05_st_16x128b_param_8+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r228, %r229, %r230, %r231}, [nvvm_tcgen05_st_16x128b_param_8+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r232, %r233, %r234, %r235}, [nvvm_tcgen05_st_16x128b_param_8+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r236, %r237, %r238, %r239}, [nvvm_tcgen05_st_16x128b_param_8+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r240, %r241, %r242, %r243}, [nvvm_tcgen05_st_16x128b_param_8+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r244, %r245, %r246, %r247}, [nvvm_tcgen05_st_16x128b_param_8+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r248, %r249, %r250, %r251}, [nvvm_tcgen05_st_16x128b_param_8+16]; -; CHECK-NEXT: ld.param.v4.u32 
{%r252, %r253, %r254, %r255}, [nvvm_tcgen05_st_16x128b_param_8]; +; CHECK-NEXT: ld.param.v4.b32 {%r128, %r129, %r130, %r131}, [nvvm_tcgen05_st_16x128b_param_8+496]; +; CHECK-NEXT: ld.param.v4.b32 {%r132, %r133, %r134, %r135}, [nvvm_tcgen05_st_16x128b_param_8+480]; +; CHECK-NEXT: ld.param.v4.b32 {%r136, %r137, %r138, %r139}, [nvvm_tcgen05_st_16x128b_param_8+464]; +; CHECK-NEXT: ld.param.v4.b32 {%r140, %r141, %r142, %r143}, [nvvm_tcgen05_st_16x128b_param_8+448]; +; CHECK-NEXT: ld.param.v4.b32 {%r144, %r145, %r146, %r147}, [nvvm_tcgen05_st_16x128b_param_8+432]; +; CHECK-NEXT: ld.param.v4.b32 {%r148, %r149, %r150, %r151}, [nvvm_tcgen05_st_16x128b_param_8+416]; +; CHECK-NEXT: ld.param.v4.b32 {%r152, %r153, %r154, %r155}, [nvvm_tcgen05_st_16x128b_param_8+400]; +; CHECK-NEXT: ld.param.v4.b32 {%r156, %r157, %r158, %r159}, [nvvm_tcgen05_st_16x128b_param_8+384]; +; CHECK-NEXT: ld.param.v4.b32 {%r160, %r161, %r162, %r163}, [nvvm_tcgen05_st_16x128b_param_8+368]; +; CHECK-NEXT: ld.param.v4.b32 {%r164, %r165, %r166, %r167}, [nvvm_tcgen05_st_16x128b_param_8+352]; +; CHECK-NEXT: ld.param.v4.b32 {%r168, %r169, %r170, %r171}, [nvvm_tcgen05_st_16x128b_param_8+336]; +; CHECK-NEXT: ld.param.v4.b32 {%r172, %r173, %r174, %r175}, [nvvm_tcgen05_st_16x128b_param_8+320]; +; CHECK-NEXT: ld.param.v4.b32 {%r176, %r177, %r178, %r179}, [nvvm_tcgen05_st_16x128b_param_8+304]; +; CHECK-NEXT: ld.param.v4.b32 {%r180, %r181, %r182, %r183}, [nvvm_tcgen05_st_16x128b_param_8+288]; +; CHECK-NEXT: ld.param.v4.b32 {%r184, %r185, %r186, %r187}, [nvvm_tcgen05_st_16x128b_param_8+272]; +; CHECK-NEXT: ld.param.v4.b32 {%r188, %r189, %r190, %r191}, [nvvm_tcgen05_st_16x128b_param_8+256]; +; CHECK-NEXT: ld.param.v4.b32 {%r192, %r193, %r194, %r195}, [nvvm_tcgen05_st_16x128b_param_8+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r196, %r197, %r198, %r199}, [nvvm_tcgen05_st_16x128b_param_8+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r200, %r201, %r202, %r203}, [nvvm_tcgen05_st_16x128b_param_8+208]; +; CHECK-NEXT: ld.param.v4.b32 
{%r204, %r205, %r206, %r207}, [nvvm_tcgen05_st_16x128b_param_8+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r208, %r209, %r210, %r211}, [nvvm_tcgen05_st_16x128b_param_8+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r212, %r213, %r214, %r215}, [nvvm_tcgen05_st_16x128b_param_8+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r216, %r217, %r218, %r219}, [nvvm_tcgen05_st_16x128b_param_8+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r220, %r221, %r222, %r223}, [nvvm_tcgen05_st_16x128b_param_8+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r224, %r225, %r226, %r227}, [nvvm_tcgen05_st_16x128b_param_8+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r228, %r229, %r230, %r231}, [nvvm_tcgen05_st_16x128b_param_8+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r232, %r233, %r234, %r235}, [nvvm_tcgen05_st_16x128b_param_8+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r236, %r237, %r238, %r239}, [nvvm_tcgen05_st_16x128b_param_8+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r240, %r241, %r242, %r243}, [nvvm_tcgen05_st_16x128b_param_8+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r244, %r245, %r246, %r247}, [nvvm_tcgen05_st_16x128b_param_8+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r248, %r249, %r250, %r251}, [nvvm_tcgen05_st_16x128b_param_8+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r252, %r253, %r254, %r255}, [nvvm_tcgen05_st_16x128b_param_8]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x128b.x64.b32 [%r1], {%r252, %r253, %r254, %r255, %r248, %r249, %r250, %r251, %r244, %r245, %r246, %r247, %r240, %r241, %r242, %r243, %r236, %r237, %r238, %r239, %r232, %r233, %r234, %r235, %r228, %r229, %r230, %r231, %r224, %r225, %r226, %r227, %r220, %r221, %r222, %r223, %r216, %r217, %r218, %r219, %r212, %r213, %r214, %r215, %r208, %r209, %r210, %r211, %r204, %r205, %r206, %r207, %r200, %r201, %r202, %r203, %r196, %r197, %r198, %r199, %r192, %r193, %r194, %r195, %r188, %r189, %r190, %r191, %r184, %r185, %r186, %r187, %r180, %r181, %r182, %r183, %r176, %r177, %r178, %r179, %r172, %r173, %r174, %r175, %r168, %r169, %r170, %r171, %r164, %r165, %r166, %r167, %r160, %r161, %r162, %r163, %r156, 
%r157, %r158, %r159, %r152, %r153, %r154, %r155, %r148, %r149, %r150, %r151, %r144, %r145, %r146, %r147, %r140, %r141, %r142, %r143, %r136, %r137, %r138, %r139, %r132, %r133, %r134, %r135, %r128, %r129, %r130, %r131}; ; CHECK-NEXT: ret; tail call void @llvm.nvvm.tcgen05.st.16x128b.x1(ptr addrspace(6) %taddr, <2 x i32> %stv2, i1 0) @@ -307,77 +307,77 @@ define void @nvvm_tcgen05_st_16x128b_unpack(ptr addrspace(6) %taddr, i32 %stv1, ; CHECK-NEXT: .reg .b32 %r<256>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_st_16x128b_unpack_param_0]; -; CHECK-NEXT: ld.param.v2.u32 {%r2, %r3}, [nvvm_tcgen05_st_16x128b_unpack_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_st_16x128b_unpack_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r2, %r3}, [nvvm_tcgen05_st_16x128b_unpack_param_2]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x128b.x1.unpack::16b.b32 [%r1], {%r2, %r3}; -; CHECK-NEXT: ld.param.v4.u32 {%r4, %r5, %r6, %r7}, [nvvm_tcgen05_st_16x128b_unpack_param_3]; +; CHECK-NEXT: ld.param.v4.b32 {%r4, %r5, %r6, %r7}, [nvvm_tcgen05_st_16x128b_unpack_param_3]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x128b.x2.unpack::16b.b32 [%r1], {%r4, %r5, %r6, %r7}; -; CHECK-NEXT: ld.param.v4.u32 {%r8, %r9, %r10, %r11}, [nvvm_tcgen05_st_16x128b_unpack_param_4+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r12, %r13, %r14, %r15}, [nvvm_tcgen05_st_16x128b_unpack_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r8, %r9, %r10, %r11}, [nvvm_tcgen05_st_16x128b_unpack_param_4+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r12, %r13, %r14, %r15}, [nvvm_tcgen05_st_16x128b_unpack_param_4]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x128b.x4.unpack::16b.b32 [%r1], {%r12, %r13, %r14, %r15, %r8, %r9, %r10, %r11}; -; CHECK-NEXT: ld.param.v4.u32 {%r16, %r17, %r18, %r19}, [nvvm_tcgen05_st_16x128b_unpack_param_5+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r20, %r21, %r22, %r23}, [nvvm_tcgen05_st_16x128b_unpack_param_5+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r24, %r25, %r26, %r27}, 
[nvvm_tcgen05_st_16x128b_unpack_param_5+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r28, %r29, %r30, %r31}, [nvvm_tcgen05_st_16x128b_unpack_param_5]; +; CHECK-NEXT: ld.param.v4.b32 {%r16, %r17, %r18, %r19}, [nvvm_tcgen05_st_16x128b_unpack_param_5+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r20, %r21, %r22, %r23}, [nvvm_tcgen05_st_16x128b_unpack_param_5+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r24, %r25, %r26, %r27}, [nvvm_tcgen05_st_16x128b_unpack_param_5+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r28, %r29, %r30, %r31}, [nvvm_tcgen05_st_16x128b_unpack_param_5]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x128b.x8.unpack::16b.b32 [%r1], {%r28, %r29, %r30, %r31, %r24, %r25, %r26, %r27, %r20, %r21, %r22, %r23, %r16, %r17, %r18, %r19}; -; CHECK-NEXT: ld.param.v4.u32 {%r32, %r33, %r34, %r35}, [nvvm_tcgen05_st_16x128b_unpack_param_6+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r36, %r37, %r38, %r39}, [nvvm_tcgen05_st_16x128b_unpack_param_6+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r40, %r41, %r42, %r43}, [nvvm_tcgen05_st_16x128b_unpack_param_6+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r44, %r45, %r46, %r47}, [nvvm_tcgen05_st_16x128b_unpack_param_6+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r48, %r49, %r50, %r51}, [nvvm_tcgen05_st_16x128b_unpack_param_6+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r52, %r53, %r54, %r55}, [nvvm_tcgen05_st_16x128b_unpack_param_6+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r56, %r57, %r58, %r59}, [nvvm_tcgen05_st_16x128b_unpack_param_6+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r60, %r61, %r62, %r63}, [nvvm_tcgen05_st_16x128b_unpack_param_6]; +; CHECK-NEXT: ld.param.v4.b32 {%r32, %r33, %r34, %r35}, [nvvm_tcgen05_st_16x128b_unpack_param_6+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r36, %r37, %r38, %r39}, [nvvm_tcgen05_st_16x128b_unpack_param_6+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r40, %r41, %r42, %r43}, [nvvm_tcgen05_st_16x128b_unpack_param_6+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r44, %r45, %r46, %r47}, [nvvm_tcgen05_st_16x128b_unpack_param_6+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r48, %r49, %r50, %r51}, 
[nvvm_tcgen05_st_16x128b_unpack_param_6+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r52, %r53, %r54, %r55}, [nvvm_tcgen05_st_16x128b_unpack_param_6+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r56, %r57, %r58, %r59}, [nvvm_tcgen05_st_16x128b_unpack_param_6+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r60, %r61, %r62, %r63}, [nvvm_tcgen05_st_16x128b_unpack_param_6]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x128b.x16.unpack::16b.b32 [%r1], {%r60, %r61, %r62, %r63, %r56, %r57, %r58, %r59, %r52, %r53, %r54, %r55, %r48, %r49, %r50, %r51, %r44, %r45, %r46, %r47, %r40, %r41, %r42, %r43, %r36, %r37, %r38, %r39, %r32, %r33, %r34, %r35}; -; CHECK-NEXT: ld.param.v4.u32 {%r64, %r65, %r66, %r67}, [nvvm_tcgen05_st_16x128b_unpack_param_7+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r68, %r69, %r70, %r71}, [nvvm_tcgen05_st_16x128b_unpack_param_7+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r72, %r73, %r74, %r75}, [nvvm_tcgen05_st_16x128b_unpack_param_7+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r76, %r77, %r78, %r79}, [nvvm_tcgen05_st_16x128b_unpack_param_7+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r80, %r81, %r82, %r83}, [nvvm_tcgen05_st_16x128b_unpack_param_7+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r84, %r85, %r86, %r87}, [nvvm_tcgen05_st_16x128b_unpack_param_7+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r88, %r89, %r90, %r91}, [nvvm_tcgen05_st_16x128b_unpack_param_7+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r92, %r93, %r94, %r95}, [nvvm_tcgen05_st_16x128b_unpack_param_7+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r96, %r97, %r98, %r99}, [nvvm_tcgen05_st_16x128b_unpack_param_7+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r100, %r101, %r102, %r103}, [nvvm_tcgen05_st_16x128b_unpack_param_7+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r104, %r105, %r106, %r107}, [nvvm_tcgen05_st_16x128b_unpack_param_7+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r108, %r109, %r110, %r111}, [nvvm_tcgen05_st_16x128b_unpack_param_7+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r112, %r113, %r114, %r115}, [nvvm_tcgen05_st_16x128b_unpack_param_7+48]; -; CHECK-NEXT: ld.param.v4.u32 
{%r116, %r117, %r118, %r119}, [nvvm_tcgen05_st_16x128b_unpack_param_7+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r120, %r121, %r122, %r123}, [nvvm_tcgen05_st_16x128b_unpack_param_7+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r124, %r125, %r126, %r127}, [nvvm_tcgen05_st_16x128b_unpack_param_7]; +; CHECK-NEXT: ld.param.v4.b32 {%r64, %r65, %r66, %r67}, [nvvm_tcgen05_st_16x128b_unpack_param_7+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r68, %r69, %r70, %r71}, [nvvm_tcgen05_st_16x128b_unpack_param_7+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r72, %r73, %r74, %r75}, [nvvm_tcgen05_st_16x128b_unpack_param_7+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r76, %r77, %r78, %r79}, [nvvm_tcgen05_st_16x128b_unpack_param_7+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r80, %r81, %r82, %r83}, [nvvm_tcgen05_st_16x128b_unpack_param_7+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r84, %r85, %r86, %r87}, [nvvm_tcgen05_st_16x128b_unpack_param_7+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r88, %r89, %r90, %r91}, [nvvm_tcgen05_st_16x128b_unpack_param_7+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r92, %r93, %r94, %r95}, [nvvm_tcgen05_st_16x128b_unpack_param_7+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r96, %r97, %r98, %r99}, [nvvm_tcgen05_st_16x128b_unpack_param_7+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r100, %r101, %r102, %r103}, [nvvm_tcgen05_st_16x128b_unpack_param_7+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r104, %r105, %r106, %r107}, [nvvm_tcgen05_st_16x128b_unpack_param_7+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r108, %r109, %r110, %r111}, [nvvm_tcgen05_st_16x128b_unpack_param_7+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r112, %r113, %r114, %r115}, [nvvm_tcgen05_st_16x128b_unpack_param_7+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r116, %r117, %r118, %r119}, [nvvm_tcgen05_st_16x128b_unpack_param_7+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r120, %r121, %r122, %r123}, [nvvm_tcgen05_st_16x128b_unpack_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r124, %r125, %r126, %r127}, [nvvm_tcgen05_st_16x128b_unpack_param_7]; ; CHECK-NEXT: 
tcgen05.st.sync.aligned.16x128b.x32.unpack::16b.b32 [%r1], {%r124, %r125, %r126, %r127, %r120, %r121, %r122, %r123, %r116, %r117, %r118, %r119, %r112, %r113, %r114, %r115, %r108, %r109, %r110, %r111, %r104, %r105, %r106, %r107, %r100, %r101, %r102, %r103, %r96, %r97, %r98, %r99, %r92, %r93, %r94, %r95, %r88, %r89, %r90, %r91, %r84, %r85, %r86, %r87, %r80, %r81, %r82, %r83, %r76, %r77, %r78, %r79, %r72, %r73, %r74, %r75, %r68, %r69, %r70, %r71, %r64, %r65, %r66, %r67}; -; CHECK-NEXT: ld.param.v4.u32 {%r128, %r129, %r130, %r131}, [nvvm_tcgen05_st_16x128b_unpack_param_8+496]; -; CHECK-NEXT: ld.param.v4.u32 {%r132, %r133, %r134, %r135}, [nvvm_tcgen05_st_16x128b_unpack_param_8+480]; -; CHECK-NEXT: ld.param.v4.u32 {%r136, %r137, %r138, %r139}, [nvvm_tcgen05_st_16x128b_unpack_param_8+464]; -; CHECK-NEXT: ld.param.v4.u32 {%r140, %r141, %r142, %r143}, [nvvm_tcgen05_st_16x128b_unpack_param_8+448]; -; CHECK-NEXT: ld.param.v4.u32 {%r144, %r145, %r146, %r147}, [nvvm_tcgen05_st_16x128b_unpack_param_8+432]; -; CHECK-NEXT: ld.param.v4.u32 {%r148, %r149, %r150, %r151}, [nvvm_tcgen05_st_16x128b_unpack_param_8+416]; -; CHECK-NEXT: ld.param.v4.u32 {%r152, %r153, %r154, %r155}, [nvvm_tcgen05_st_16x128b_unpack_param_8+400]; -; CHECK-NEXT: ld.param.v4.u32 {%r156, %r157, %r158, %r159}, [nvvm_tcgen05_st_16x128b_unpack_param_8+384]; -; CHECK-NEXT: ld.param.v4.u32 {%r160, %r161, %r162, %r163}, [nvvm_tcgen05_st_16x128b_unpack_param_8+368]; -; CHECK-NEXT: ld.param.v4.u32 {%r164, %r165, %r166, %r167}, [nvvm_tcgen05_st_16x128b_unpack_param_8+352]; -; CHECK-NEXT: ld.param.v4.u32 {%r168, %r169, %r170, %r171}, [nvvm_tcgen05_st_16x128b_unpack_param_8+336]; -; CHECK-NEXT: ld.param.v4.u32 {%r172, %r173, %r174, %r175}, [nvvm_tcgen05_st_16x128b_unpack_param_8+320]; -; CHECK-NEXT: ld.param.v4.u32 {%r176, %r177, %r178, %r179}, [nvvm_tcgen05_st_16x128b_unpack_param_8+304]; -; CHECK-NEXT: ld.param.v4.u32 {%r180, %r181, %r182, %r183}, [nvvm_tcgen05_st_16x128b_unpack_param_8+288]; -; CHECK-NEXT: 
ld.param.v4.u32 {%r184, %r185, %r186, %r187}, [nvvm_tcgen05_st_16x128b_unpack_param_8+272]; -; CHECK-NEXT: ld.param.v4.u32 {%r188, %r189, %r190, %r191}, [nvvm_tcgen05_st_16x128b_unpack_param_8+256]; -; CHECK-NEXT: ld.param.v4.u32 {%r192, %r193, %r194, %r195}, [nvvm_tcgen05_st_16x128b_unpack_param_8+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r196, %r197, %r198, %r199}, [nvvm_tcgen05_st_16x128b_unpack_param_8+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r200, %r201, %r202, %r203}, [nvvm_tcgen05_st_16x128b_unpack_param_8+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r204, %r205, %r206, %r207}, [nvvm_tcgen05_st_16x128b_unpack_param_8+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r208, %r209, %r210, %r211}, [nvvm_tcgen05_st_16x128b_unpack_param_8+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r212, %r213, %r214, %r215}, [nvvm_tcgen05_st_16x128b_unpack_param_8+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r216, %r217, %r218, %r219}, [nvvm_tcgen05_st_16x128b_unpack_param_8+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r220, %r221, %r222, %r223}, [nvvm_tcgen05_st_16x128b_unpack_param_8+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r224, %r225, %r226, %r227}, [nvvm_tcgen05_st_16x128b_unpack_param_8+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r228, %r229, %r230, %r231}, [nvvm_tcgen05_st_16x128b_unpack_param_8+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r232, %r233, %r234, %r235}, [nvvm_tcgen05_st_16x128b_unpack_param_8+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r236, %r237, %r238, %r239}, [nvvm_tcgen05_st_16x128b_unpack_param_8+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r240, %r241, %r242, %r243}, [nvvm_tcgen05_st_16x128b_unpack_param_8+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r244, %r245, %r246, %r247}, [nvvm_tcgen05_st_16x128b_unpack_param_8+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r248, %r249, %r250, %r251}, [nvvm_tcgen05_st_16x128b_unpack_param_8+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r252, %r253, %r254, %r255}, [nvvm_tcgen05_st_16x128b_unpack_param_8]; +; CHECK-NEXT: ld.param.v4.b32 {%r128, %r129, %r130, %r131}, 
[nvvm_tcgen05_st_16x128b_unpack_param_8+496]; +; CHECK-NEXT: ld.param.v4.b32 {%r132, %r133, %r134, %r135}, [nvvm_tcgen05_st_16x128b_unpack_param_8+480]; +; CHECK-NEXT: ld.param.v4.b32 {%r136, %r137, %r138, %r139}, [nvvm_tcgen05_st_16x128b_unpack_param_8+464]; +; CHECK-NEXT: ld.param.v4.b32 {%r140, %r141, %r142, %r143}, [nvvm_tcgen05_st_16x128b_unpack_param_8+448]; +; CHECK-NEXT: ld.param.v4.b32 {%r144, %r145, %r146, %r147}, [nvvm_tcgen05_st_16x128b_unpack_param_8+432]; +; CHECK-NEXT: ld.param.v4.b32 {%r148, %r149, %r150, %r151}, [nvvm_tcgen05_st_16x128b_unpack_param_8+416]; +; CHECK-NEXT: ld.param.v4.b32 {%r152, %r153, %r154, %r155}, [nvvm_tcgen05_st_16x128b_unpack_param_8+400]; +; CHECK-NEXT: ld.param.v4.b32 {%r156, %r157, %r158, %r159}, [nvvm_tcgen05_st_16x128b_unpack_param_8+384]; +; CHECK-NEXT: ld.param.v4.b32 {%r160, %r161, %r162, %r163}, [nvvm_tcgen05_st_16x128b_unpack_param_8+368]; +; CHECK-NEXT: ld.param.v4.b32 {%r164, %r165, %r166, %r167}, [nvvm_tcgen05_st_16x128b_unpack_param_8+352]; +; CHECK-NEXT: ld.param.v4.b32 {%r168, %r169, %r170, %r171}, [nvvm_tcgen05_st_16x128b_unpack_param_8+336]; +; CHECK-NEXT: ld.param.v4.b32 {%r172, %r173, %r174, %r175}, [nvvm_tcgen05_st_16x128b_unpack_param_8+320]; +; CHECK-NEXT: ld.param.v4.b32 {%r176, %r177, %r178, %r179}, [nvvm_tcgen05_st_16x128b_unpack_param_8+304]; +; CHECK-NEXT: ld.param.v4.b32 {%r180, %r181, %r182, %r183}, [nvvm_tcgen05_st_16x128b_unpack_param_8+288]; +; CHECK-NEXT: ld.param.v4.b32 {%r184, %r185, %r186, %r187}, [nvvm_tcgen05_st_16x128b_unpack_param_8+272]; +; CHECK-NEXT: ld.param.v4.b32 {%r188, %r189, %r190, %r191}, [nvvm_tcgen05_st_16x128b_unpack_param_8+256]; +; CHECK-NEXT: ld.param.v4.b32 {%r192, %r193, %r194, %r195}, [nvvm_tcgen05_st_16x128b_unpack_param_8+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r196, %r197, %r198, %r199}, [nvvm_tcgen05_st_16x128b_unpack_param_8+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r200, %r201, %r202, %r203}, [nvvm_tcgen05_st_16x128b_unpack_param_8+208]; +; CHECK-NEXT: 
ld.param.v4.b32 {%r204, %r205, %r206, %r207}, [nvvm_tcgen05_st_16x128b_unpack_param_8+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r208, %r209, %r210, %r211}, [nvvm_tcgen05_st_16x128b_unpack_param_8+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r212, %r213, %r214, %r215}, [nvvm_tcgen05_st_16x128b_unpack_param_8+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r216, %r217, %r218, %r219}, [nvvm_tcgen05_st_16x128b_unpack_param_8+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r220, %r221, %r222, %r223}, [nvvm_tcgen05_st_16x128b_unpack_param_8+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r224, %r225, %r226, %r227}, [nvvm_tcgen05_st_16x128b_unpack_param_8+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r228, %r229, %r230, %r231}, [nvvm_tcgen05_st_16x128b_unpack_param_8+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r232, %r233, %r234, %r235}, [nvvm_tcgen05_st_16x128b_unpack_param_8+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r236, %r237, %r238, %r239}, [nvvm_tcgen05_st_16x128b_unpack_param_8+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r240, %r241, %r242, %r243}, [nvvm_tcgen05_st_16x128b_unpack_param_8+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r244, %r245, %r246, %r247}, [nvvm_tcgen05_st_16x128b_unpack_param_8+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r248, %r249, %r250, %r251}, [nvvm_tcgen05_st_16x128b_unpack_param_8+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r252, %r253, %r254, %r255}, [nvvm_tcgen05_st_16x128b_unpack_param_8]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x128b.x64.unpack::16b.b32 [%r1], {%r252, %r253, %r254, %r255, %r248, %r249, %r250, %r251, %r244, %r245, %r246, %r247, %r240, %r241, %r242, %r243, %r236, %r237, %r238, %r239, %r232, %r233, %r234, %r235, %r228, %r229, %r230, %r231, %r224, %r225, %r226, %r227, %r220, %r221, %r222, %r223, %r216, %r217, %r218, %r219, %r212, %r213, %r214, %r215, %r208, %r209, %r210, %r211, %r204, %r205, %r206, %r207, %r200, %r201, %r202, %r203, %r196, %r197, %r198, %r199, %r192, %r193, %r194, %r195, %r188, %r189, %r190, %r191, %r184, %r185, %r186, %r187, %r180, %r181, %r182, %r183, %r176, %r177, %r178, %r179, 
%r172, %r173, %r174, %r175, %r168, %r169, %r170, %r171, %r164, %r165, %r166, %r167, %r160, %r161, %r162, %r163, %r156, %r157, %r158, %r159, %r152, %r153, %r154, %r155, %r148, %r149, %r150, %r151, %r144, %r145, %r146, %r147, %r140, %r141, %r142, %r143, %r136, %r137, %r138, %r139, %r132, %r133, %r134, %r135, %r128, %r129, %r130, %r131}; ; CHECK-NEXT: ret; tail call void @llvm.nvvm.tcgen05.st.16x128b.x1(ptr addrspace(6) %taddr, <2 x i32> %stv2, i1 1) @@ -403,75 +403,75 @@ define void @nvvm_tcgen05_st_16x256b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i3 ; CHECK-NEXT: .reg .b32 %r<254>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_st_16x256b_param_0]; -; CHECK-NEXT: ld.param.v4.u32 {%r2, %r3, %r4, %r5}, [nvvm_tcgen05_st_16x256b_param_3]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_st_16x256b_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r2, %r3, %r4, %r5}, [nvvm_tcgen05_st_16x256b_param_3]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x256b.x1.b32 [%r1], {%r2, %r3, %r4, %r5}; -; CHECK-NEXT: ld.param.v4.u32 {%r6, %r7, %r8, %r9}, [nvvm_tcgen05_st_16x256b_param_4+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r10, %r11, %r12, %r13}, [nvvm_tcgen05_st_16x256b_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r6, %r7, %r8, %r9}, [nvvm_tcgen05_st_16x256b_param_4+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r10, %r11, %r12, %r13}, [nvvm_tcgen05_st_16x256b_param_4]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x256b.x2.b32 [%r1], {%r10, %r11, %r12, %r13, %r6, %r7, %r8, %r9}; -; CHECK-NEXT: ld.param.v4.u32 {%r14, %r15, %r16, %r17}, [nvvm_tcgen05_st_16x256b_param_5+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r18, %r19, %r20, %r21}, [nvvm_tcgen05_st_16x256b_param_5+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r22, %r23, %r24, %r25}, [nvvm_tcgen05_st_16x256b_param_5+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r26, %r27, %r28, %r29}, [nvvm_tcgen05_st_16x256b_param_5]; +; CHECK-NEXT: ld.param.v4.b32 {%r14, %r15, %r16, %r17}, [nvvm_tcgen05_st_16x256b_param_5+48]; +; CHECK-NEXT: ld.param.v4.b32 
{%r18, %r19, %r20, %r21}, [nvvm_tcgen05_st_16x256b_param_5+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r22, %r23, %r24, %r25}, [nvvm_tcgen05_st_16x256b_param_5+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r26, %r27, %r28, %r29}, [nvvm_tcgen05_st_16x256b_param_5]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x256b.x4.b32 [%r1], {%r26, %r27, %r28, %r29, %r22, %r23, %r24, %r25, %r18, %r19, %r20, %r21, %r14, %r15, %r16, %r17}; -; CHECK-NEXT: ld.param.v4.u32 {%r30, %r31, %r32, %r33}, [nvvm_tcgen05_st_16x256b_param_6+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r34, %r35, %r36, %r37}, [nvvm_tcgen05_st_16x256b_param_6+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r38, %r39, %r40, %r41}, [nvvm_tcgen05_st_16x256b_param_6+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r42, %r43, %r44, %r45}, [nvvm_tcgen05_st_16x256b_param_6+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r46, %r47, %r48, %r49}, [nvvm_tcgen05_st_16x256b_param_6+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r50, %r51, %r52, %r53}, [nvvm_tcgen05_st_16x256b_param_6+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r54, %r55, %r56, %r57}, [nvvm_tcgen05_st_16x256b_param_6+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r58, %r59, %r60, %r61}, [nvvm_tcgen05_st_16x256b_param_6]; +; CHECK-NEXT: ld.param.v4.b32 {%r30, %r31, %r32, %r33}, [nvvm_tcgen05_st_16x256b_param_6+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r34, %r35, %r36, %r37}, [nvvm_tcgen05_st_16x256b_param_6+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r38, %r39, %r40, %r41}, [nvvm_tcgen05_st_16x256b_param_6+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r42, %r43, %r44, %r45}, [nvvm_tcgen05_st_16x256b_param_6+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r46, %r47, %r48, %r49}, [nvvm_tcgen05_st_16x256b_param_6+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r50, %r51, %r52, %r53}, [nvvm_tcgen05_st_16x256b_param_6+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r54, %r55, %r56, %r57}, [nvvm_tcgen05_st_16x256b_param_6+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r58, %r59, %r60, %r61}, [nvvm_tcgen05_st_16x256b_param_6]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x256b.x8.b32 [%r1], {%r58, %r59, 
%r60, %r61, %r54, %r55, %r56, %r57, %r50, %r51, %r52, %r53, %r46, %r47, %r48, %r49, %r42, %r43, %r44, %r45, %r38, %r39, %r40, %r41, %r34, %r35, %r36, %r37, %r30, %r31, %r32, %r33}; -; CHECK-NEXT: ld.param.v4.u32 {%r62, %r63, %r64, %r65}, [nvvm_tcgen05_st_16x256b_param_7+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r66, %r67, %r68, %r69}, [nvvm_tcgen05_st_16x256b_param_7+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r70, %r71, %r72, %r73}, [nvvm_tcgen05_st_16x256b_param_7+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r74, %r75, %r76, %r77}, [nvvm_tcgen05_st_16x256b_param_7+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r78, %r79, %r80, %r81}, [nvvm_tcgen05_st_16x256b_param_7+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r82, %r83, %r84, %r85}, [nvvm_tcgen05_st_16x256b_param_7+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r86, %r87, %r88, %r89}, [nvvm_tcgen05_st_16x256b_param_7+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r90, %r91, %r92, %r93}, [nvvm_tcgen05_st_16x256b_param_7+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r94, %r95, %r96, %r97}, [nvvm_tcgen05_st_16x256b_param_7+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r98, %r99, %r100, %r101}, [nvvm_tcgen05_st_16x256b_param_7+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r102, %r103, %r104, %r105}, [nvvm_tcgen05_st_16x256b_param_7+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r106, %r107, %r108, %r109}, [nvvm_tcgen05_st_16x256b_param_7+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r110, %r111, %r112, %r113}, [nvvm_tcgen05_st_16x256b_param_7+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r114, %r115, %r116, %r117}, [nvvm_tcgen05_st_16x256b_param_7+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r118, %r119, %r120, %r121}, [nvvm_tcgen05_st_16x256b_param_7+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r122, %r123, %r124, %r125}, [nvvm_tcgen05_st_16x256b_param_7]; +; CHECK-NEXT: ld.param.v4.b32 {%r62, %r63, %r64, %r65}, [nvvm_tcgen05_st_16x256b_param_7+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r66, %r67, %r68, %r69}, [nvvm_tcgen05_st_16x256b_param_7+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r70, %r71, %r72, %r73}, 
[nvvm_tcgen05_st_16x256b_param_7+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r74, %r75, %r76, %r77}, [nvvm_tcgen05_st_16x256b_param_7+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r78, %r79, %r80, %r81}, [nvvm_tcgen05_st_16x256b_param_7+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r82, %r83, %r84, %r85}, [nvvm_tcgen05_st_16x256b_param_7+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r86, %r87, %r88, %r89}, [nvvm_tcgen05_st_16x256b_param_7+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r90, %r91, %r92, %r93}, [nvvm_tcgen05_st_16x256b_param_7+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r94, %r95, %r96, %r97}, [nvvm_tcgen05_st_16x256b_param_7+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r98, %r99, %r100, %r101}, [nvvm_tcgen05_st_16x256b_param_7+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r102, %r103, %r104, %r105}, [nvvm_tcgen05_st_16x256b_param_7+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r106, %r107, %r108, %r109}, [nvvm_tcgen05_st_16x256b_param_7+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r110, %r111, %r112, %r113}, [nvvm_tcgen05_st_16x256b_param_7+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r114, %r115, %r116, %r117}, [nvvm_tcgen05_st_16x256b_param_7+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r118, %r119, %r120, %r121}, [nvvm_tcgen05_st_16x256b_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r122, %r123, %r124, %r125}, [nvvm_tcgen05_st_16x256b_param_7]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x256b.x16.b32 [%r1], {%r122, %r123, %r124, %r125, %r118, %r119, %r120, %r121, %r114, %r115, %r116, %r117, %r110, %r111, %r112, %r113, %r106, %r107, %r108, %r109, %r102, %r103, %r104, %r105, %r98, %r99, %r100, %r101, %r94, %r95, %r96, %r97, %r90, %r91, %r92, %r93, %r86, %r87, %r88, %r89, %r82, %r83, %r84, %r85, %r78, %r79, %r80, %r81, %r74, %r75, %r76, %r77, %r70, %r71, %r72, %r73, %r66, %r67, %r68, %r69, %r62, %r63, %r64, %r65}; -; CHECK-NEXT: ld.param.v4.u32 {%r126, %r127, %r128, %r129}, [nvvm_tcgen05_st_16x256b_param_8+496]; -; CHECK-NEXT: ld.param.v4.u32 {%r130, %r131, %r132, %r133}, [nvvm_tcgen05_st_16x256b_param_8+480]; -; CHECK-NEXT: 
ld.param.v4.u32 {%r134, %r135, %r136, %r137}, [nvvm_tcgen05_st_16x256b_param_8+464]; -; CHECK-NEXT: ld.param.v4.u32 {%r138, %r139, %r140, %r141}, [nvvm_tcgen05_st_16x256b_param_8+448]; -; CHECK-NEXT: ld.param.v4.u32 {%r142, %r143, %r144, %r145}, [nvvm_tcgen05_st_16x256b_param_8+432]; -; CHECK-NEXT: ld.param.v4.u32 {%r146, %r147, %r148, %r149}, [nvvm_tcgen05_st_16x256b_param_8+416]; -; CHECK-NEXT: ld.param.v4.u32 {%r150, %r151, %r152, %r153}, [nvvm_tcgen05_st_16x256b_param_8+400]; -; CHECK-NEXT: ld.param.v4.u32 {%r154, %r155, %r156, %r157}, [nvvm_tcgen05_st_16x256b_param_8+384]; -; CHECK-NEXT: ld.param.v4.u32 {%r158, %r159, %r160, %r161}, [nvvm_tcgen05_st_16x256b_param_8+368]; -; CHECK-NEXT: ld.param.v4.u32 {%r162, %r163, %r164, %r165}, [nvvm_tcgen05_st_16x256b_param_8+352]; -; CHECK-NEXT: ld.param.v4.u32 {%r166, %r167, %r168, %r169}, [nvvm_tcgen05_st_16x256b_param_8+336]; -; CHECK-NEXT: ld.param.v4.u32 {%r170, %r171, %r172, %r173}, [nvvm_tcgen05_st_16x256b_param_8+320]; -; CHECK-NEXT: ld.param.v4.u32 {%r174, %r175, %r176, %r177}, [nvvm_tcgen05_st_16x256b_param_8+304]; -; CHECK-NEXT: ld.param.v4.u32 {%r178, %r179, %r180, %r181}, [nvvm_tcgen05_st_16x256b_param_8+288]; -; CHECK-NEXT: ld.param.v4.u32 {%r182, %r183, %r184, %r185}, [nvvm_tcgen05_st_16x256b_param_8+272]; -; CHECK-NEXT: ld.param.v4.u32 {%r186, %r187, %r188, %r189}, [nvvm_tcgen05_st_16x256b_param_8+256]; -; CHECK-NEXT: ld.param.v4.u32 {%r190, %r191, %r192, %r193}, [nvvm_tcgen05_st_16x256b_param_8+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r194, %r195, %r196, %r197}, [nvvm_tcgen05_st_16x256b_param_8+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r198, %r199, %r200, %r201}, [nvvm_tcgen05_st_16x256b_param_8+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r202, %r203, %r204, %r205}, [nvvm_tcgen05_st_16x256b_param_8+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r206, %r207, %r208, %r209}, [nvvm_tcgen05_st_16x256b_param_8+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r210, %r211, %r212, %r213}, [nvvm_tcgen05_st_16x256b_param_8+160]; -; CHECK-NEXT: 
ld.param.v4.u32 {%r214, %r215, %r216, %r217}, [nvvm_tcgen05_st_16x256b_param_8+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r218, %r219, %r220, %r221}, [nvvm_tcgen05_st_16x256b_param_8+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r222, %r223, %r224, %r225}, [nvvm_tcgen05_st_16x256b_param_8+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r226, %r227, %r228, %r229}, [nvvm_tcgen05_st_16x256b_param_8+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r230, %r231, %r232, %r233}, [nvvm_tcgen05_st_16x256b_param_8+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r234, %r235, %r236, %r237}, [nvvm_tcgen05_st_16x256b_param_8+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r238, %r239, %r240, %r241}, [nvvm_tcgen05_st_16x256b_param_8+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r242, %r243, %r244, %r245}, [nvvm_tcgen05_st_16x256b_param_8+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r246, %r247, %r248, %r249}, [nvvm_tcgen05_st_16x256b_param_8+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r250, %r251, %r252, %r253}, [nvvm_tcgen05_st_16x256b_param_8]; +; CHECK-NEXT: ld.param.v4.b32 {%r126, %r127, %r128, %r129}, [nvvm_tcgen05_st_16x256b_param_8+496]; +; CHECK-NEXT: ld.param.v4.b32 {%r130, %r131, %r132, %r133}, [nvvm_tcgen05_st_16x256b_param_8+480]; +; CHECK-NEXT: ld.param.v4.b32 {%r134, %r135, %r136, %r137}, [nvvm_tcgen05_st_16x256b_param_8+464]; +; CHECK-NEXT: ld.param.v4.b32 {%r138, %r139, %r140, %r141}, [nvvm_tcgen05_st_16x256b_param_8+448]; +; CHECK-NEXT: ld.param.v4.b32 {%r142, %r143, %r144, %r145}, [nvvm_tcgen05_st_16x256b_param_8+432]; +; CHECK-NEXT: ld.param.v4.b32 {%r146, %r147, %r148, %r149}, [nvvm_tcgen05_st_16x256b_param_8+416]; +; CHECK-NEXT: ld.param.v4.b32 {%r150, %r151, %r152, %r153}, [nvvm_tcgen05_st_16x256b_param_8+400]; +; CHECK-NEXT: ld.param.v4.b32 {%r154, %r155, %r156, %r157}, [nvvm_tcgen05_st_16x256b_param_8+384]; +; CHECK-NEXT: ld.param.v4.b32 {%r158, %r159, %r160, %r161}, [nvvm_tcgen05_st_16x256b_param_8+368]; +; CHECK-NEXT: ld.param.v4.b32 {%r162, %r163, %r164, %r165}, [nvvm_tcgen05_st_16x256b_param_8+352]; +; CHECK-NEXT: 
ld.param.v4.b32 {%r166, %r167, %r168, %r169}, [nvvm_tcgen05_st_16x256b_param_8+336]; +; CHECK-NEXT: ld.param.v4.b32 {%r170, %r171, %r172, %r173}, [nvvm_tcgen05_st_16x256b_param_8+320]; +; CHECK-NEXT: ld.param.v4.b32 {%r174, %r175, %r176, %r177}, [nvvm_tcgen05_st_16x256b_param_8+304]; +; CHECK-NEXT: ld.param.v4.b32 {%r178, %r179, %r180, %r181}, [nvvm_tcgen05_st_16x256b_param_8+288]; +; CHECK-NEXT: ld.param.v4.b32 {%r182, %r183, %r184, %r185}, [nvvm_tcgen05_st_16x256b_param_8+272]; +; CHECK-NEXT: ld.param.v4.b32 {%r186, %r187, %r188, %r189}, [nvvm_tcgen05_st_16x256b_param_8+256]; +; CHECK-NEXT: ld.param.v4.b32 {%r190, %r191, %r192, %r193}, [nvvm_tcgen05_st_16x256b_param_8+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r194, %r195, %r196, %r197}, [nvvm_tcgen05_st_16x256b_param_8+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r198, %r199, %r200, %r201}, [nvvm_tcgen05_st_16x256b_param_8+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r202, %r203, %r204, %r205}, [nvvm_tcgen05_st_16x256b_param_8+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r206, %r207, %r208, %r209}, [nvvm_tcgen05_st_16x256b_param_8+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r210, %r211, %r212, %r213}, [nvvm_tcgen05_st_16x256b_param_8+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r214, %r215, %r216, %r217}, [nvvm_tcgen05_st_16x256b_param_8+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r218, %r219, %r220, %r221}, [nvvm_tcgen05_st_16x256b_param_8+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r222, %r223, %r224, %r225}, [nvvm_tcgen05_st_16x256b_param_8+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r226, %r227, %r228, %r229}, [nvvm_tcgen05_st_16x256b_param_8+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r230, %r231, %r232, %r233}, [nvvm_tcgen05_st_16x256b_param_8+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r234, %r235, %r236, %r237}, [nvvm_tcgen05_st_16x256b_param_8+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r238, %r239, %r240, %r241}, [nvvm_tcgen05_st_16x256b_param_8+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r242, %r243, %r244, %r245}, [nvvm_tcgen05_st_16x256b_param_8+32]; +; CHECK-NEXT: 
ld.param.v4.b32 {%r246, %r247, %r248, %r249}, [nvvm_tcgen05_st_16x256b_param_8+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r250, %r251, %r252, %r253}, [nvvm_tcgen05_st_16x256b_param_8]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x256b.x32.b32 [%r1], {%r250, %r251, %r252, %r253, %r246, %r247, %r248, %r249, %r242, %r243, %r244, %r245, %r238, %r239, %r240, %r241, %r234, %r235, %r236, %r237, %r230, %r231, %r232, %r233, %r226, %r227, %r228, %r229, %r222, %r223, %r224, %r225, %r218, %r219, %r220, %r221, %r214, %r215, %r216, %r217, %r210, %r211, %r212, %r213, %r206, %r207, %r208, %r209, %r202, %r203, %r204, %r205, %r198, %r199, %r200, %r201, %r194, %r195, %r196, %r197, %r190, %r191, %r192, %r193, %r186, %r187, %r188, %r189, %r182, %r183, %r184, %r185, %r178, %r179, %r180, %r181, %r174, %r175, %r176, %r177, %r170, %r171, %r172, %r173, %r166, %r167, %r168, %r169, %r162, %r163, %r164, %r165, %r158, %r159, %r160, %r161, %r154, %r155, %r156, %r157, %r150, %r151, %r152, %r153, %r146, %r147, %r148, %r149, %r142, %r143, %r144, %r145, %r138, %r139, %r140, %r141, %r134, %r135, %r136, %r137, %r130, %r131, %r132, %r133, %r126, %r127, %r128, %r129}; ; CHECK-NEXT: ret; tail call void @llvm.nvvm.tcgen05.st.16x256b.x1(ptr addrspace(6) %taddr, <4 x i32> %stv4, i1 0) @@ -495,75 +495,75 @@ define void @nvvm_tcgen05_st_16x256b_unpack(ptr addrspace(6) %taddr, i32 %stv1, ; CHECK-NEXT: .reg .b32 %r<254>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_st_16x256b_unpack_param_0]; -; CHECK-NEXT: ld.param.v4.u32 {%r2, %r3, %r4, %r5}, [nvvm_tcgen05_st_16x256b_unpack_param_3]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_st_16x256b_unpack_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r2, %r3, %r4, %r5}, [nvvm_tcgen05_st_16x256b_unpack_param_3]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x256b.x1.unpack::16b.b32 [%r1], {%r2, %r3, %r4, %r5}; -; CHECK-NEXT: ld.param.v4.u32 {%r6, %r7, %r8, %r9}, [nvvm_tcgen05_st_16x256b_unpack_param_4+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r10, 
%r11, %r12, %r13}, [nvvm_tcgen05_st_16x256b_unpack_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r6, %r7, %r8, %r9}, [nvvm_tcgen05_st_16x256b_unpack_param_4+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r10, %r11, %r12, %r13}, [nvvm_tcgen05_st_16x256b_unpack_param_4]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x256b.x2.unpack::16b.b32 [%r1], {%r10, %r11, %r12, %r13, %r6, %r7, %r8, %r9}; -; CHECK-NEXT: ld.param.v4.u32 {%r14, %r15, %r16, %r17}, [nvvm_tcgen05_st_16x256b_unpack_param_5+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r18, %r19, %r20, %r21}, [nvvm_tcgen05_st_16x256b_unpack_param_5+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r22, %r23, %r24, %r25}, [nvvm_tcgen05_st_16x256b_unpack_param_5+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r26, %r27, %r28, %r29}, [nvvm_tcgen05_st_16x256b_unpack_param_5]; +; CHECK-NEXT: ld.param.v4.b32 {%r14, %r15, %r16, %r17}, [nvvm_tcgen05_st_16x256b_unpack_param_5+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r18, %r19, %r20, %r21}, [nvvm_tcgen05_st_16x256b_unpack_param_5+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r22, %r23, %r24, %r25}, [nvvm_tcgen05_st_16x256b_unpack_param_5+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r26, %r27, %r28, %r29}, [nvvm_tcgen05_st_16x256b_unpack_param_5]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x256b.x4.unpack::16b.b32 [%r1], {%r26, %r27, %r28, %r29, %r22, %r23, %r24, %r25, %r18, %r19, %r20, %r21, %r14, %r15, %r16, %r17}; -; CHECK-NEXT: ld.param.v4.u32 {%r30, %r31, %r32, %r33}, [nvvm_tcgen05_st_16x256b_unpack_param_6+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r34, %r35, %r36, %r37}, [nvvm_tcgen05_st_16x256b_unpack_param_6+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r38, %r39, %r40, %r41}, [nvvm_tcgen05_st_16x256b_unpack_param_6+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r42, %r43, %r44, %r45}, [nvvm_tcgen05_st_16x256b_unpack_param_6+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r46, %r47, %r48, %r49}, [nvvm_tcgen05_st_16x256b_unpack_param_6+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r50, %r51, %r52, %r53}, [nvvm_tcgen05_st_16x256b_unpack_param_6+32]; -; CHECK-NEXT: ld.param.v4.u32 
{%r54, %r55, %r56, %r57}, [nvvm_tcgen05_st_16x256b_unpack_param_6+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r58, %r59, %r60, %r61}, [nvvm_tcgen05_st_16x256b_unpack_param_6]; +; CHECK-NEXT: ld.param.v4.b32 {%r30, %r31, %r32, %r33}, [nvvm_tcgen05_st_16x256b_unpack_param_6+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r34, %r35, %r36, %r37}, [nvvm_tcgen05_st_16x256b_unpack_param_6+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r38, %r39, %r40, %r41}, [nvvm_tcgen05_st_16x256b_unpack_param_6+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r42, %r43, %r44, %r45}, [nvvm_tcgen05_st_16x256b_unpack_param_6+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r46, %r47, %r48, %r49}, [nvvm_tcgen05_st_16x256b_unpack_param_6+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r50, %r51, %r52, %r53}, [nvvm_tcgen05_st_16x256b_unpack_param_6+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r54, %r55, %r56, %r57}, [nvvm_tcgen05_st_16x256b_unpack_param_6+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r58, %r59, %r60, %r61}, [nvvm_tcgen05_st_16x256b_unpack_param_6]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x256b.x8.unpack::16b.b32 [%r1], {%r58, %r59, %r60, %r61, %r54, %r55, %r56, %r57, %r50, %r51, %r52, %r53, %r46, %r47, %r48, %r49, %r42, %r43, %r44, %r45, %r38, %r39, %r40, %r41, %r34, %r35, %r36, %r37, %r30, %r31, %r32, %r33}; -; CHECK-NEXT: ld.param.v4.u32 {%r62, %r63, %r64, %r65}, [nvvm_tcgen05_st_16x256b_unpack_param_7+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r66, %r67, %r68, %r69}, [nvvm_tcgen05_st_16x256b_unpack_param_7+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r70, %r71, %r72, %r73}, [nvvm_tcgen05_st_16x256b_unpack_param_7+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r74, %r75, %r76, %r77}, [nvvm_tcgen05_st_16x256b_unpack_param_7+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r78, %r79, %r80, %r81}, [nvvm_tcgen05_st_16x256b_unpack_param_7+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r82, %r83, %r84, %r85}, [nvvm_tcgen05_st_16x256b_unpack_param_7+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r86, %r87, %r88, %r89}, [nvvm_tcgen05_st_16x256b_unpack_param_7+144]; -; CHECK-NEXT: 
ld.param.v4.u32 {%r90, %r91, %r92, %r93}, [nvvm_tcgen05_st_16x256b_unpack_param_7+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r94, %r95, %r96, %r97}, [nvvm_tcgen05_st_16x256b_unpack_param_7+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r98, %r99, %r100, %r101}, [nvvm_tcgen05_st_16x256b_unpack_param_7+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r102, %r103, %r104, %r105}, [nvvm_tcgen05_st_16x256b_unpack_param_7+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r106, %r107, %r108, %r109}, [nvvm_tcgen05_st_16x256b_unpack_param_7+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r110, %r111, %r112, %r113}, [nvvm_tcgen05_st_16x256b_unpack_param_7+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r114, %r115, %r116, %r117}, [nvvm_tcgen05_st_16x256b_unpack_param_7+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r118, %r119, %r120, %r121}, [nvvm_tcgen05_st_16x256b_unpack_param_7+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r122, %r123, %r124, %r125}, [nvvm_tcgen05_st_16x256b_unpack_param_7]; +; CHECK-NEXT: ld.param.v4.b32 {%r62, %r63, %r64, %r65}, [nvvm_tcgen05_st_16x256b_unpack_param_7+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r66, %r67, %r68, %r69}, [nvvm_tcgen05_st_16x256b_unpack_param_7+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r70, %r71, %r72, %r73}, [nvvm_tcgen05_st_16x256b_unpack_param_7+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r74, %r75, %r76, %r77}, [nvvm_tcgen05_st_16x256b_unpack_param_7+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r78, %r79, %r80, %r81}, [nvvm_tcgen05_st_16x256b_unpack_param_7+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r82, %r83, %r84, %r85}, [nvvm_tcgen05_st_16x256b_unpack_param_7+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r86, %r87, %r88, %r89}, [nvvm_tcgen05_st_16x256b_unpack_param_7+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r90, %r91, %r92, %r93}, [nvvm_tcgen05_st_16x256b_unpack_param_7+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r94, %r95, %r96, %r97}, [nvvm_tcgen05_st_16x256b_unpack_param_7+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r98, %r99, %r100, %r101}, [nvvm_tcgen05_st_16x256b_unpack_param_7+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r102, 
%r103, %r104, %r105}, [nvvm_tcgen05_st_16x256b_unpack_param_7+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r106, %r107, %r108, %r109}, [nvvm_tcgen05_st_16x256b_unpack_param_7+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r110, %r111, %r112, %r113}, [nvvm_tcgen05_st_16x256b_unpack_param_7+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r114, %r115, %r116, %r117}, [nvvm_tcgen05_st_16x256b_unpack_param_7+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r118, %r119, %r120, %r121}, [nvvm_tcgen05_st_16x256b_unpack_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r122, %r123, %r124, %r125}, [nvvm_tcgen05_st_16x256b_unpack_param_7]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x256b.x16.unpack::16b.b32 [%r1], {%r122, %r123, %r124, %r125, %r118, %r119, %r120, %r121, %r114, %r115, %r116, %r117, %r110, %r111, %r112, %r113, %r106, %r107, %r108, %r109, %r102, %r103, %r104, %r105, %r98, %r99, %r100, %r101, %r94, %r95, %r96, %r97, %r90, %r91, %r92, %r93, %r86, %r87, %r88, %r89, %r82, %r83, %r84, %r85, %r78, %r79, %r80, %r81, %r74, %r75, %r76, %r77, %r70, %r71, %r72, %r73, %r66, %r67, %r68, %r69, %r62, %r63, %r64, %r65}; -; CHECK-NEXT: ld.param.v4.u32 {%r126, %r127, %r128, %r129}, [nvvm_tcgen05_st_16x256b_unpack_param_8+496]; -; CHECK-NEXT: ld.param.v4.u32 {%r130, %r131, %r132, %r133}, [nvvm_tcgen05_st_16x256b_unpack_param_8+480]; -; CHECK-NEXT: ld.param.v4.u32 {%r134, %r135, %r136, %r137}, [nvvm_tcgen05_st_16x256b_unpack_param_8+464]; -; CHECK-NEXT: ld.param.v4.u32 {%r138, %r139, %r140, %r141}, [nvvm_tcgen05_st_16x256b_unpack_param_8+448]; -; CHECK-NEXT: ld.param.v4.u32 {%r142, %r143, %r144, %r145}, [nvvm_tcgen05_st_16x256b_unpack_param_8+432]; -; CHECK-NEXT: ld.param.v4.u32 {%r146, %r147, %r148, %r149}, [nvvm_tcgen05_st_16x256b_unpack_param_8+416]; -; CHECK-NEXT: ld.param.v4.u32 {%r150, %r151, %r152, %r153}, [nvvm_tcgen05_st_16x256b_unpack_param_8+400]; -; CHECK-NEXT: ld.param.v4.u32 {%r154, %r155, %r156, %r157}, [nvvm_tcgen05_st_16x256b_unpack_param_8+384]; -; CHECK-NEXT: ld.param.v4.u32 {%r158, %r159, %r160, %r161}, 
[nvvm_tcgen05_st_16x256b_unpack_param_8+368]; -; CHECK-NEXT: ld.param.v4.u32 {%r162, %r163, %r164, %r165}, [nvvm_tcgen05_st_16x256b_unpack_param_8+352]; -; CHECK-NEXT: ld.param.v4.u32 {%r166, %r167, %r168, %r169}, [nvvm_tcgen05_st_16x256b_unpack_param_8+336]; -; CHECK-NEXT: ld.param.v4.u32 {%r170, %r171, %r172, %r173}, [nvvm_tcgen05_st_16x256b_unpack_param_8+320]; -; CHECK-NEXT: ld.param.v4.u32 {%r174, %r175, %r176, %r177}, [nvvm_tcgen05_st_16x256b_unpack_param_8+304]; -; CHECK-NEXT: ld.param.v4.u32 {%r178, %r179, %r180, %r181}, [nvvm_tcgen05_st_16x256b_unpack_param_8+288]; -; CHECK-NEXT: ld.param.v4.u32 {%r182, %r183, %r184, %r185}, [nvvm_tcgen05_st_16x256b_unpack_param_8+272]; -; CHECK-NEXT: ld.param.v4.u32 {%r186, %r187, %r188, %r189}, [nvvm_tcgen05_st_16x256b_unpack_param_8+256]; -; CHECK-NEXT: ld.param.v4.u32 {%r190, %r191, %r192, %r193}, [nvvm_tcgen05_st_16x256b_unpack_param_8+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r194, %r195, %r196, %r197}, [nvvm_tcgen05_st_16x256b_unpack_param_8+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r198, %r199, %r200, %r201}, [nvvm_tcgen05_st_16x256b_unpack_param_8+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r202, %r203, %r204, %r205}, [nvvm_tcgen05_st_16x256b_unpack_param_8+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r206, %r207, %r208, %r209}, [nvvm_tcgen05_st_16x256b_unpack_param_8+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r210, %r211, %r212, %r213}, [nvvm_tcgen05_st_16x256b_unpack_param_8+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r214, %r215, %r216, %r217}, [nvvm_tcgen05_st_16x256b_unpack_param_8+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r218, %r219, %r220, %r221}, [nvvm_tcgen05_st_16x256b_unpack_param_8+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r222, %r223, %r224, %r225}, [nvvm_tcgen05_st_16x256b_unpack_param_8+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r226, %r227, %r228, %r229}, [nvvm_tcgen05_st_16x256b_unpack_param_8+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r230, %r231, %r232, %r233}, [nvvm_tcgen05_st_16x256b_unpack_param_8+80]; -; CHECK-NEXT: 
ld.param.v4.u32 {%r234, %r235, %r236, %r237}, [nvvm_tcgen05_st_16x256b_unpack_param_8+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r238, %r239, %r240, %r241}, [nvvm_tcgen05_st_16x256b_unpack_param_8+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r242, %r243, %r244, %r245}, [nvvm_tcgen05_st_16x256b_unpack_param_8+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r246, %r247, %r248, %r249}, [nvvm_tcgen05_st_16x256b_unpack_param_8+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r250, %r251, %r252, %r253}, [nvvm_tcgen05_st_16x256b_unpack_param_8]; +; CHECK-NEXT: ld.param.v4.b32 {%r126, %r127, %r128, %r129}, [nvvm_tcgen05_st_16x256b_unpack_param_8+496]; +; CHECK-NEXT: ld.param.v4.b32 {%r130, %r131, %r132, %r133}, [nvvm_tcgen05_st_16x256b_unpack_param_8+480]; +; CHECK-NEXT: ld.param.v4.b32 {%r134, %r135, %r136, %r137}, [nvvm_tcgen05_st_16x256b_unpack_param_8+464]; +; CHECK-NEXT: ld.param.v4.b32 {%r138, %r139, %r140, %r141}, [nvvm_tcgen05_st_16x256b_unpack_param_8+448]; +; CHECK-NEXT: ld.param.v4.b32 {%r142, %r143, %r144, %r145}, [nvvm_tcgen05_st_16x256b_unpack_param_8+432]; +; CHECK-NEXT: ld.param.v4.b32 {%r146, %r147, %r148, %r149}, [nvvm_tcgen05_st_16x256b_unpack_param_8+416]; +; CHECK-NEXT: ld.param.v4.b32 {%r150, %r151, %r152, %r153}, [nvvm_tcgen05_st_16x256b_unpack_param_8+400]; +; CHECK-NEXT: ld.param.v4.b32 {%r154, %r155, %r156, %r157}, [nvvm_tcgen05_st_16x256b_unpack_param_8+384]; +; CHECK-NEXT: ld.param.v4.b32 {%r158, %r159, %r160, %r161}, [nvvm_tcgen05_st_16x256b_unpack_param_8+368]; +; CHECK-NEXT: ld.param.v4.b32 {%r162, %r163, %r164, %r165}, [nvvm_tcgen05_st_16x256b_unpack_param_8+352]; +; CHECK-NEXT: ld.param.v4.b32 {%r166, %r167, %r168, %r169}, [nvvm_tcgen05_st_16x256b_unpack_param_8+336]; +; CHECK-NEXT: ld.param.v4.b32 {%r170, %r171, %r172, %r173}, [nvvm_tcgen05_st_16x256b_unpack_param_8+320]; +; CHECK-NEXT: ld.param.v4.b32 {%r174, %r175, %r176, %r177}, [nvvm_tcgen05_st_16x256b_unpack_param_8+304]; +; CHECK-NEXT: ld.param.v4.b32 {%r178, %r179, %r180, %r181}, 
[nvvm_tcgen05_st_16x256b_unpack_param_8+288]; +; CHECK-NEXT: ld.param.v4.b32 {%r182, %r183, %r184, %r185}, [nvvm_tcgen05_st_16x256b_unpack_param_8+272]; +; CHECK-NEXT: ld.param.v4.b32 {%r186, %r187, %r188, %r189}, [nvvm_tcgen05_st_16x256b_unpack_param_8+256]; +; CHECK-NEXT: ld.param.v4.b32 {%r190, %r191, %r192, %r193}, [nvvm_tcgen05_st_16x256b_unpack_param_8+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r194, %r195, %r196, %r197}, [nvvm_tcgen05_st_16x256b_unpack_param_8+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r198, %r199, %r200, %r201}, [nvvm_tcgen05_st_16x256b_unpack_param_8+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r202, %r203, %r204, %r205}, [nvvm_tcgen05_st_16x256b_unpack_param_8+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r206, %r207, %r208, %r209}, [nvvm_tcgen05_st_16x256b_unpack_param_8+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r210, %r211, %r212, %r213}, [nvvm_tcgen05_st_16x256b_unpack_param_8+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r214, %r215, %r216, %r217}, [nvvm_tcgen05_st_16x256b_unpack_param_8+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r218, %r219, %r220, %r221}, [nvvm_tcgen05_st_16x256b_unpack_param_8+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r222, %r223, %r224, %r225}, [nvvm_tcgen05_st_16x256b_unpack_param_8+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r226, %r227, %r228, %r229}, [nvvm_tcgen05_st_16x256b_unpack_param_8+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r230, %r231, %r232, %r233}, [nvvm_tcgen05_st_16x256b_unpack_param_8+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r234, %r235, %r236, %r237}, [nvvm_tcgen05_st_16x256b_unpack_param_8+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r238, %r239, %r240, %r241}, [nvvm_tcgen05_st_16x256b_unpack_param_8+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r242, %r243, %r244, %r245}, [nvvm_tcgen05_st_16x256b_unpack_param_8+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r246, %r247, %r248, %r249}, [nvvm_tcgen05_st_16x256b_unpack_param_8+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r250, %r251, %r252, %r253}, [nvvm_tcgen05_st_16x256b_unpack_param_8]; ; CHECK-NEXT: 
tcgen05.st.sync.aligned.16x256b.x32.unpack::16b.b32 [%r1], {%r250, %r251, %r252, %r253, %r246, %r247, %r248, %r249, %r242, %r243, %r244, %r245, %r238, %r239, %r240, %r241, %r234, %r235, %r236, %r237, %r230, %r231, %r232, %r233, %r226, %r227, %r228, %r229, %r222, %r223, %r224, %r225, %r218, %r219, %r220, %r221, %r214, %r215, %r216, %r217, %r210, %r211, %r212, %r213, %r206, %r207, %r208, %r209, %r202, %r203, %r204, %r205, %r198, %r199, %r200, %r201, %r194, %r195, %r196, %r197, %r190, %r191, %r192, %r193, %r186, %r187, %r188, %r189, %r182, %r183, %r184, %r185, %r178, %r179, %r180, %r181, %r174, %r175, %r176, %r177, %r170, %r171, %r172, %r173, %r166, %r167, %r168, %r169, %r162, %r163, %r164, %r165, %r158, %r159, %r160, %r161, %r154, %r155, %r156, %r157, %r150, %r151, %r152, %r153, %r146, %r147, %r148, %r149, %r142, %r143, %r144, %r145, %r138, %r139, %r140, %r141, %r134, %r135, %r136, %r137, %r130, %r131, %r132, %r133, %r126, %r127, %r128, %r129}; ; CHECK-NEXT: ret; tail call void @llvm.nvvm.tcgen05.st.16x256b.x1(ptr addrspace(6) %taddr, <4 x i32> %stv4, i1 1) @@ -587,79 +587,79 @@ define void @nvvm_tcgen05_st_32x32b(ptr addrspace(6) %taddr, i32 %stv1, <2 x i32 ; CHECK-NEXT: .reg .b32 %r<257>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_st_32x32b_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [nvvm_tcgen05_st_32x32b_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_st_32x32b_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [nvvm_tcgen05_st_32x32b_param_1]; ; CHECK-NEXT: tcgen05.st.sync.aligned.32x32b.x1.b32 [%r1], {%r2}; -; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [nvvm_tcgen05_st_32x32b_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [nvvm_tcgen05_st_32x32b_param_2]; ; CHECK-NEXT: tcgen05.st.sync.aligned.32x32b.x2.b32 [%r1], {%r3, %r4}; -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_32x32b_param_3]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_32x32b_param_3]; ; CHECK-NEXT: 
tcgen05.st.sync.aligned.32x32b.x4.b32 [%r1], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_32x32b_param_4+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_32x32b_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_32x32b_param_4+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_32x32b_param_4]; ; CHECK-NEXT: tcgen05.st.sync.aligned.32x32b.x8.b32 [%r1], {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}; -; CHECK-NEXT: ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_32x32b_param_5+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_32x32b_param_5+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_32x32b_param_5+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_32x32b_param_5]; +; CHECK-NEXT: ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_32x32b_param_5+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_32x32b_param_5+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_32x32b_param_5+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_32x32b_param_5]; ; CHECK-NEXT: tcgen05.st.sync.aligned.32x32b.x16.b32 [%r1], {%r29, %r30, %r31, %r32, %r25, %r26, %r27, %r28, %r21, %r22, %r23, %r24, %r17, %r18, %r19, %r20}; -; CHECK-NEXT: ld.param.v4.u32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_32x32b_param_6+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_32x32b_param_6+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_32x32b_param_6+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_32x32b_param_6+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_32x32b_param_6+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r53, %r54, %r55, %r56}, 
[nvvm_tcgen05_st_32x32b_param_6+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_32x32b_param_6+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_32x32b_param_6]; +; CHECK-NEXT: ld.param.v4.b32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_32x32b_param_6+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_32x32b_param_6+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_32x32b_param_6+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_32x32b_param_6+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_32x32b_param_6+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_32x32b_param_6+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_32x32b_param_6+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_32x32b_param_6]; ; CHECK-NEXT: tcgen05.st.sync.aligned.32x32b.x32.b32 [%r1], {%r61, %r62, %r63, %r64, %r57, %r58, %r59, %r60, %r53, %r54, %r55, %r56, %r49, %r50, %r51, %r52, %r45, %r46, %r47, %r48, %r41, %r42, %r43, %r44, %r37, %r38, %r39, %r40, %r33, %r34, %r35, %r36}; -; CHECK-NEXT: ld.param.v4.u32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_32x32b_param_7+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_32x32b_param_7+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_32x32b_param_7+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_32x32b_param_7+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_32x32b_param_7+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_32x32b_param_7+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_32x32b_param_7+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_32x32b_param_7+128]; -; 
CHECK-NEXT: ld.param.v4.u32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_32x32b_param_7+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_32x32b_param_7+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_32x32b_param_7+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_32x32b_param_7+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_32x32b_param_7+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_32x32b_param_7+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_32x32b_param_7+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_32x32b_param_7]; +; CHECK-NEXT: ld.param.v4.b32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_32x32b_param_7+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_32x32b_param_7+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_32x32b_param_7+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_32x32b_param_7+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_32x32b_param_7+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_32x32b_param_7+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_32x32b_param_7+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_32x32b_param_7+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_32x32b_param_7+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_32x32b_param_7+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_32x32b_param_7+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_32x32b_param_7+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r113, %r114, %r115, %r116}, 
[nvvm_tcgen05_st_32x32b_param_7+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_32x32b_param_7+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_32x32b_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_32x32b_param_7]; ; CHECK-NEXT: tcgen05.st.sync.aligned.32x32b.x64.b32 [%r1], {%r125, %r126, %r127, %r128, %r121, %r122, %r123, %r124, %r117, %r118, %r119, %r120, %r113, %r114, %r115, %r116, %r109, %r110, %r111, %r112, %r105, %r106, %r107, %r108, %r101, %r102, %r103, %r104, %r97, %r98, %r99, %r100, %r93, %r94, %r95, %r96, %r89, %r90, %r91, %r92, %r85, %r86, %r87, %r88, %r81, %r82, %r83, %r84, %r77, %r78, %r79, %r80, %r73, %r74, %r75, %r76, %r69, %r70, %r71, %r72, %r65, %r66, %r67, %r68}; -; CHECK-NEXT: ld.param.v4.u32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_32x32b_param_8+496]; -; CHECK-NEXT: ld.param.v4.u32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_32x32b_param_8+480]; -; CHECK-NEXT: ld.param.v4.u32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_32x32b_param_8+464]; -; CHECK-NEXT: ld.param.v4.u32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_32x32b_param_8+448]; -; CHECK-NEXT: ld.param.v4.u32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_32x32b_param_8+432]; -; CHECK-NEXT: ld.param.v4.u32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_32x32b_param_8+416]; -; CHECK-NEXT: ld.param.v4.u32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_32x32b_param_8+400]; -; CHECK-NEXT: ld.param.v4.u32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_32x32b_param_8+384]; -; CHECK-NEXT: ld.param.v4.u32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_32x32b_param_8+368]; -; CHECK-NEXT: ld.param.v4.u32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_32x32b_param_8+352]; -; CHECK-NEXT: ld.param.v4.u32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_32x32b_param_8+336]; -; CHECK-NEXT: ld.param.v4.u32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_32x32b_param_8+320]; -; 
CHECK-NEXT: ld.param.v4.u32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_32x32b_param_8+304]; -; CHECK-NEXT: ld.param.v4.u32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_32x32b_param_8+288]; -; CHECK-NEXT: ld.param.v4.u32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_32x32b_param_8+272]; -; CHECK-NEXT: ld.param.v4.u32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_32x32b_param_8+256]; -; CHECK-NEXT: ld.param.v4.u32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_32x32b_param_8+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_32x32b_param_8+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_32x32b_param_8+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_32x32b_param_8+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_32x32b_param_8+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_32x32b_param_8+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_32x32b_param_8+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_32x32b_param_8+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_32x32b_param_8+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_32x32b_param_8+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_32x32b_param_8+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_32x32b_param_8+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_32x32b_param_8+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_32x32b_param_8+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_32x32b_param_8+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_32x32b_param_8]; +; CHECK-NEXT: ld.param.v4.b32 
{%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_32x32b_param_8+496]; +; CHECK-NEXT: ld.param.v4.b32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_32x32b_param_8+480]; +; CHECK-NEXT: ld.param.v4.b32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_32x32b_param_8+464]; +; CHECK-NEXT: ld.param.v4.b32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_32x32b_param_8+448]; +; CHECK-NEXT: ld.param.v4.b32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_32x32b_param_8+432]; +; CHECK-NEXT: ld.param.v4.b32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_32x32b_param_8+416]; +; CHECK-NEXT: ld.param.v4.b32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_32x32b_param_8+400]; +; CHECK-NEXT: ld.param.v4.b32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_32x32b_param_8+384]; +; CHECK-NEXT: ld.param.v4.b32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_32x32b_param_8+368]; +; CHECK-NEXT: ld.param.v4.b32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_32x32b_param_8+352]; +; CHECK-NEXT: ld.param.v4.b32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_32x32b_param_8+336]; +; CHECK-NEXT: ld.param.v4.b32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_32x32b_param_8+320]; +; CHECK-NEXT: ld.param.v4.b32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_32x32b_param_8+304]; +; CHECK-NEXT: ld.param.v4.b32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_32x32b_param_8+288]; +; CHECK-NEXT: ld.param.v4.b32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_32x32b_param_8+272]; +; CHECK-NEXT: ld.param.v4.b32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_32x32b_param_8+256]; +; CHECK-NEXT: ld.param.v4.b32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_32x32b_param_8+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_32x32b_param_8+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_32x32b_param_8+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_32x32b_param_8+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r209, %r210, 
%r211, %r212}, [nvvm_tcgen05_st_32x32b_param_8+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_32x32b_param_8+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_32x32b_param_8+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_32x32b_param_8+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_32x32b_param_8+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_32x32b_param_8+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_32x32b_param_8+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_32x32b_param_8+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_32x32b_param_8+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_32x32b_param_8+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_32x32b_param_8+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_32x32b_param_8]; ; CHECK-NEXT: tcgen05.st.sync.aligned.32x32b.x128.b32 [%r1], {%r253, %r254, %r255, %r256, %r249, %r250, %r251, %r252, %r245, %r246, %r247, %r248, %r241, %r242, %r243, %r244, %r237, %r238, %r239, %r240, %r233, %r234, %r235, %r236, %r229, %r230, %r231, %r232, %r225, %r226, %r227, %r228, %r221, %r222, %r223, %r224, %r217, %r218, %r219, %r220, %r213, %r214, %r215, %r216, %r209, %r210, %r211, %r212, %r205, %r206, %r207, %r208, %r201, %r202, %r203, %r204, %r197, %r198, %r199, %r200, %r193, %r194, %r195, %r196, %r189, %r190, %r191, %r192, %r185, %r186, %r187, %r188, %r181, %r182, %r183, %r184, %r177, %r178, %r179, %r180, %r173, %r174, %r175, %r176, %r169, %r170, %r171, %r172, %r165, %r166, %r167, %r168, %r161, %r162, %r163, %r164, %r157, %r158, %r159, %r160, %r153, %r154, %r155, %r156, %r149, %r150, %r151, %r152, %r145, %r146, %r147, %r148, %r141, %r142, %r143, 
%r144, %r137, %r138, %r139, %r140, %r133, %r134, %r135, %r136, %r129, %r130, %r131, %r132}; ; CHECK-NEXT: ret; tail call void @llvm.nvvm.tcgen05.st.32x32b.x1(ptr addrspace(6) %taddr, i32 %stv1, i1 0) @@ -687,79 +687,79 @@ define void @nvvm_tcgen05_st_32x32b_unpack(ptr addrspace(6) %taddr, i32 %stv1, < ; CHECK-NEXT: .reg .b32 %r<257>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_st_32x32b_unpack_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [nvvm_tcgen05_st_32x32b_unpack_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_st_32x32b_unpack_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [nvvm_tcgen05_st_32x32b_unpack_param_1]; ; CHECK-NEXT: tcgen05.st.sync.aligned.32x32b.x1.unpack::16b.b32 [%r1], {%r2}; -; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [nvvm_tcgen05_st_32x32b_unpack_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [nvvm_tcgen05_st_32x32b_unpack_param_2]; ; CHECK-NEXT: tcgen05.st.sync.aligned.32x32b.x2.unpack::16b.b32 [%r1], {%r3, %r4}; -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_32x32b_unpack_param_3]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_32x32b_unpack_param_3]; ; CHECK-NEXT: tcgen05.st.sync.aligned.32x32b.x4.unpack::16b.b32 [%r1], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_32x32b_unpack_param_4+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_32x32b_unpack_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_32x32b_unpack_param_4+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_32x32b_unpack_param_4]; ; CHECK-NEXT: tcgen05.st.sync.aligned.32x32b.x8.unpack::16b.b32 [%r1], {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}; -; CHECK-NEXT: ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_32x32b_unpack_param_5+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_32x32b_unpack_param_5+32]; 
-; CHECK-NEXT: ld.param.v4.u32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_32x32b_unpack_param_5+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_32x32b_unpack_param_5]; +; CHECK-NEXT: ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_32x32b_unpack_param_5+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_32x32b_unpack_param_5+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_32x32b_unpack_param_5+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_32x32b_unpack_param_5]; ; CHECK-NEXT: tcgen05.st.sync.aligned.32x32b.x16.unpack::16b.b32 [%r1], {%r29, %r30, %r31, %r32, %r25, %r26, %r27, %r28, %r21, %r22, %r23, %r24, %r17, %r18, %r19, %r20}; -; CHECK-NEXT: ld.param.v4.u32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_32x32b_unpack_param_6+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_32x32b_unpack_param_6+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_32x32b_unpack_param_6+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_32x32b_unpack_param_6+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_32x32b_unpack_param_6+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_32x32b_unpack_param_6+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_32x32b_unpack_param_6+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_32x32b_unpack_param_6]; +; CHECK-NEXT: ld.param.v4.b32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_32x32b_unpack_param_6+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_32x32b_unpack_param_6+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_32x32b_unpack_param_6+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_32x32b_unpack_param_6+64]; +; CHECK-NEXT: 
ld.param.v4.b32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_32x32b_unpack_param_6+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_32x32b_unpack_param_6+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_32x32b_unpack_param_6+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_32x32b_unpack_param_6]; ; CHECK-NEXT: tcgen05.st.sync.aligned.32x32b.x32.unpack::16b.b32 [%r1], {%r61, %r62, %r63, %r64, %r57, %r58, %r59, %r60, %r53, %r54, %r55, %r56, %r49, %r50, %r51, %r52, %r45, %r46, %r47, %r48, %r41, %r42, %r43, %r44, %r37, %r38, %r39, %r40, %r33, %r34, %r35, %r36}; -; CHECK-NEXT: ld.param.v4.u32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_32x32b_unpack_param_7+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_32x32b_unpack_param_7+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_32x32b_unpack_param_7+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_32x32b_unpack_param_7+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_32x32b_unpack_param_7+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_32x32b_unpack_param_7+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_32x32b_unpack_param_7+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_32x32b_unpack_param_7+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_32x32b_unpack_param_7+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_32x32b_unpack_param_7+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_32x32b_unpack_param_7+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_32x32b_unpack_param_7+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_32x32b_unpack_param_7+48]; -; 
CHECK-NEXT: ld.param.v4.u32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_32x32b_unpack_param_7+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_32x32b_unpack_param_7+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_32x32b_unpack_param_7]; +; CHECK-NEXT: ld.param.v4.b32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_32x32b_unpack_param_7+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_32x32b_unpack_param_7+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_32x32b_unpack_param_7+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_32x32b_unpack_param_7+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_32x32b_unpack_param_7+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_32x32b_unpack_param_7+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_32x32b_unpack_param_7+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_32x32b_unpack_param_7+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_32x32b_unpack_param_7+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_32x32b_unpack_param_7+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_32x32b_unpack_param_7+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_32x32b_unpack_param_7+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_32x32b_unpack_param_7+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_32x32b_unpack_param_7+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_32x32b_unpack_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_32x32b_unpack_param_7]; ; CHECK-NEXT: 
tcgen05.st.sync.aligned.32x32b.x64.unpack::16b.b32 [%r1], {%r125, %r126, %r127, %r128, %r121, %r122, %r123, %r124, %r117, %r118, %r119, %r120, %r113, %r114, %r115, %r116, %r109, %r110, %r111, %r112, %r105, %r106, %r107, %r108, %r101, %r102, %r103, %r104, %r97, %r98, %r99, %r100, %r93, %r94, %r95, %r96, %r89, %r90, %r91, %r92, %r85, %r86, %r87, %r88, %r81, %r82, %r83, %r84, %r77, %r78, %r79, %r80, %r73, %r74, %r75, %r76, %r69, %r70, %r71, %r72, %r65, %r66, %r67, %r68}; -; CHECK-NEXT: ld.param.v4.u32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_32x32b_unpack_param_8+496]; -; CHECK-NEXT: ld.param.v4.u32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_32x32b_unpack_param_8+480]; -; CHECK-NEXT: ld.param.v4.u32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_32x32b_unpack_param_8+464]; -; CHECK-NEXT: ld.param.v4.u32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_32x32b_unpack_param_8+448]; -; CHECK-NEXT: ld.param.v4.u32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_32x32b_unpack_param_8+432]; -; CHECK-NEXT: ld.param.v4.u32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_32x32b_unpack_param_8+416]; -; CHECK-NEXT: ld.param.v4.u32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_32x32b_unpack_param_8+400]; -; CHECK-NEXT: ld.param.v4.u32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_32x32b_unpack_param_8+384]; -; CHECK-NEXT: ld.param.v4.u32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_32x32b_unpack_param_8+368]; -; CHECK-NEXT: ld.param.v4.u32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_32x32b_unpack_param_8+352]; -; CHECK-NEXT: ld.param.v4.u32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_32x32b_unpack_param_8+336]; -; CHECK-NEXT: ld.param.v4.u32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_32x32b_unpack_param_8+320]; -; CHECK-NEXT: ld.param.v4.u32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_32x32b_unpack_param_8+304]; -; CHECK-NEXT: ld.param.v4.u32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_32x32b_unpack_param_8+288]; -; CHECK-NEXT: ld.param.v4.u32 {%r185, 
%r186, %r187, %r188}, [nvvm_tcgen05_st_32x32b_unpack_param_8+272]; -; CHECK-NEXT: ld.param.v4.u32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_32x32b_unpack_param_8+256]; -; CHECK-NEXT: ld.param.v4.u32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_32x32b_unpack_param_8+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_32x32b_unpack_param_8+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_32x32b_unpack_param_8+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_32x32b_unpack_param_8+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_32x32b_unpack_param_8+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_32x32b_unpack_param_8+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_32x32b_unpack_param_8+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_32x32b_unpack_param_8+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_32x32b_unpack_param_8+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_32x32b_unpack_param_8+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_32x32b_unpack_param_8+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_32x32b_unpack_param_8+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_32x32b_unpack_param_8+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_32x32b_unpack_param_8+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_32x32b_unpack_param_8+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_32x32b_unpack_param_8]; +; CHECK-NEXT: ld.param.v4.b32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_32x32b_unpack_param_8+496]; +; CHECK-NEXT: ld.param.v4.b32 
{%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_32x32b_unpack_param_8+480]; +; CHECK-NEXT: ld.param.v4.b32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_32x32b_unpack_param_8+464]; +; CHECK-NEXT: ld.param.v4.b32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_32x32b_unpack_param_8+448]; +; CHECK-NEXT: ld.param.v4.b32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_32x32b_unpack_param_8+432]; +; CHECK-NEXT: ld.param.v4.b32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_32x32b_unpack_param_8+416]; +; CHECK-NEXT: ld.param.v4.b32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_32x32b_unpack_param_8+400]; +; CHECK-NEXT: ld.param.v4.b32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_32x32b_unpack_param_8+384]; +; CHECK-NEXT: ld.param.v4.b32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_32x32b_unpack_param_8+368]; +; CHECK-NEXT: ld.param.v4.b32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_32x32b_unpack_param_8+352]; +; CHECK-NEXT: ld.param.v4.b32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_32x32b_unpack_param_8+336]; +; CHECK-NEXT: ld.param.v4.b32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_32x32b_unpack_param_8+320]; +; CHECK-NEXT: ld.param.v4.b32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_32x32b_unpack_param_8+304]; +; CHECK-NEXT: ld.param.v4.b32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_32x32b_unpack_param_8+288]; +; CHECK-NEXT: ld.param.v4.b32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_32x32b_unpack_param_8+272]; +; CHECK-NEXT: ld.param.v4.b32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_32x32b_unpack_param_8+256]; +; CHECK-NEXT: ld.param.v4.b32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_32x32b_unpack_param_8+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_32x32b_unpack_param_8+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_32x32b_unpack_param_8+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_32x32b_unpack_param_8+192]; +; CHECK-NEXT: 
ld.param.v4.b32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_32x32b_unpack_param_8+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_32x32b_unpack_param_8+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_32x32b_unpack_param_8+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_32x32b_unpack_param_8+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_32x32b_unpack_param_8+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_32x32b_unpack_param_8+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_32x32b_unpack_param_8+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_32x32b_unpack_param_8+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_32x32b_unpack_param_8+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_32x32b_unpack_param_8+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_32x32b_unpack_param_8+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_32x32b_unpack_param_8]; ; CHECK-NEXT: tcgen05.st.sync.aligned.32x32b.x128.unpack::16b.b32 [%r1], {%r253, %r254, %r255, %r256, %r249, %r250, %r251, %r252, %r245, %r246, %r247, %r248, %r241, %r242, %r243, %r244, %r237, %r238, %r239, %r240, %r233, %r234, %r235, %r236, %r229, %r230, %r231, %r232, %r225, %r226, %r227, %r228, %r221, %r222, %r223, %r224, %r217, %r218, %r219, %r220, %r213, %r214, %r215, %r216, %r209, %r210, %r211, %r212, %r205, %r206, %r207, %r208, %r201, %r202, %r203, %r204, %r197, %r198, %r199, %r200, %r193, %r194, %r195, %r196, %r189, %r190, %r191, %r192, %r185, %r186, %r187, %r188, %r181, %r182, %r183, %r184, %r177, %r178, %r179, %r180, %r173, %r174, %r175, %r176, %r169, %r170, %r171, %r172, %r165, %r166, %r167, %r168, %r161, %r162, %r163, %r164, %r157, 
%r158, %r159, %r160, %r153, %r154, %r155, %r156, %r149, %r150, %r151, %r152, %r145, %r146, %r147, %r148, %r141, %r142, %r143, %r144, %r137, %r138, %r139, %r140, %r133, %r134, %r135, %r136, %r129, %r130, %r131, %r132}; ; CHECK-NEXT: ret; tail call void @llvm.nvvm.tcgen05.st.32x32b.x1(ptr addrspace(6) %taddr, i32 %stv1, i1 1) @@ -787,79 +787,79 @@ define void @nvvm_tcgen05_st_16x32bx2(ptr addrspace(6) %taddr, i32 %stv1, <2 x i ; CHECK-NEXT: .reg .b32 %r<257>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [nvvm_tcgen05_st_16x32bx2_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [nvvm_tcgen05_st_16x32bx2_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_st_16x32bx2_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [nvvm_tcgen05_st_16x32bx2_param_1]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x1.b32 [%r1], 2, {%r2}; -; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [nvvm_tcgen05_st_16x32bx2_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [nvvm_tcgen05_st_16x32bx2_param_2]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x2.b32 [%r1], 2, {%r3, %r4}; -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x32bx2_param_3]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x32bx2_param_3]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x4.b32 [%r1], 2, {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x32bx2_param_4+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x32bx2_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x32bx2_param_4+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x32bx2_param_4]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x8.b32 [%r1], 2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}; -; CHECK-NEXT: ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x32bx2_param_5+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r21, %r22, %r23, %r24}, 
[nvvm_tcgen05_st_16x32bx2_param_5+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x32bx2_param_5+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x32bx2_param_5]; +; CHECK-NEXT: ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x32bx2_param_5+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_16x32bx2_param_5+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x32bx2_param_5+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x32bx2_param_5]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x16.b32 [%r1], 2, {%r29, %r30, %r31, %r32, %r25, %r26, %r27, %r28, %r21, %r22, %r23, %r24, %r17, %r18, %r19, %r20}; -; CHECK-NEXT: ld.param.v4.u32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x32bx2_param_6+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x32bx2_param_6+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x32bx2_param_6+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x32bx2_param_6+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_16x32bx2_param_6+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x32bx2_param_6+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x32bx2_param_6+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x32bx2_param_6]; +; CHECK-NEXT: ld.param.v4.b32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x32bx2_param_6+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x32bx2_param_6+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x32bx2_param_6+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x32bx2_param_6+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r49, %r50, %r51, %r52}, 
[nvvm_tcgen05_st_16x32bx2_param_6+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x32bx2_param_6+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x32bx2_param_6+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x32bx2_param_6]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x32.b32 [%r1], 2, {%r61, %r62, %r63, %r64, %r57, %r58, %r59, %r60, %r53, %r54, %r55, %r56, %r49, %r50, %r51, %r52, %r45, %r46, %r47, %r48, %r41, %r42, %r43, %r44, %r37, %r38, %r39, %r40, %r33, %r34, %r35, %r36}; -; CHECK-NEXT: ld.param.v4.u32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x32bx2_param_7+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x32bx2_param_7+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x32bx2_param_7+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_16x32bx2_param_7+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x32bx2_param_7+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x32bx2_param_7+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x32bx2_param_7+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_16x32bx2_param_7+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x32bx2_param_7+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x32bx2_param_7+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x32bx2_param_7+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x32bx2_param_7+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x32bx2_param_7+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x32bx2_param_7+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r121, 
%r122, %r123, %r124}, [nvvm_tcgen05_st_16x32bx2_param_7+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_16x32bx2_param_7]; +; CHECK-NEXT: ld.param.v4.b32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x32bx2_param_7+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x32bx2_param_7+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x32bx2_param_7+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_16x32bx2_param_7+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x32bx2_param_7+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x32bx2_param_7+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x32bx2_param_7+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_16x32bx2_param_7+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x32bx2_param_7+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x32bx2_param_7+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x32bx2_param_7+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x32bx2_param_7+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x32bx2_param_7+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x32bx2_param_7+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_16x32bx2_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_16x32bx2_param_7]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x64.b32 [%r1], 2, {%r125, %r126, %r127, %r128, %r121, %r122, %r123, %r124, %r117, %r118, %r119, %r120, %r113, %r114, %r115, %r116, %r109, %r110, %r111, %r112, %r105, %r106, %r107, %r108, %r101, %r102, %r103, %r104, %r97, 
%r98, %r99, %r100, %r93, %r94, %r95, %r96, %r89, %r90, %r91, %r92, %r85, %r86, %r87, %r88, %r81, %r82, %r83, %r84, %r77, %r78, %r79, %r80, %r73, %r74, %r75, %r76, %r69, %r70, %r71, %r72, %r65, %r66, %r67, %r68}; -; CHECK-NEXT: ld.param.v4.u32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x32bx2_param_8+496]; -; CHECK-NEXT: ld.param.v4.u32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x32bx2_param_8+480]; -; CHECK-NEXT: ld.param.v4.u32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x32bx2_param_8+464]; -; CHECK-NEXT: ld.param.v4.u32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x32bx2_param_8+448]; -; CHECK-NEXT: ld.param.v4.u32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x32bx2_param_8+432]; -; CHECK-NEXT: ld.param.v4.u32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_16x32bx2_param_8+416]; -; CHECK-NEXT: ld.param.v4.u32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x32bx2_param_8+400]; -; CHECK-NEXT: ld.param.v4.u32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x32bx2_param_8+384]; -; CHECK-NEXT: ld.param.v4.u32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x32bx2_param_8+368]; -; CHECK-NEXT: ld.param.v4.u32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x32bx2_param_8+352]; -; CHECK-NEXT: ld.param.v4.u32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x32bx2_param_8+336]; -; CHECK-NEXT: ld.param.v4.u32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_16x32bx2_param_8+320]; -; CHECK-NEXT: ld.param.v4.u32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x32bx2_param_8+304]; -; CHECK-NEXT: ld.param.v4.u32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x32bx2_param_8+288]; -; CHECK-NEXT: ld.param.v4.u32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x32bx2_param_8+272]; -; CHECK-NEXT: ld.param.v4.u32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x32bx2_param_8+256]; -; CHECK-NEXT: ld.param.v4.u32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_16x32bx2_param_8+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r197, %r198, %r199, %r200}, 
[nvvm_tcgen05_st_16x32bx2_param_8+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x32bx2_param_8+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x32bx2_param_8+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x32bx2_param_8+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_16x32bx2_param_8+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x32bx2_param_8+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_16x32bx2_param_8+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_16x32bx2_param_8+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x32bx2_param_8+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x32bx2_param_8+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x32bx2_param_8+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x32bx2_param_8+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x32bx2_param_8+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x32bx2_param_8+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_16x32bx2_param_8]; +; CHECK-NEXT: ld.param.v4.b32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x32bx2_param_8+496]; +; CHECK-NEXT: ld.param.v4.b32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x32bx2_param_8+480]; +; CHECK-NEXT: ld.param.v4.b32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x32bx2_param_8+464]; +; CHECK-NEXT: ld.param.v4.b32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x32bx2_param_8+448]; +; CHECK-NEXT: ld.param.v4.b32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x32bx2_param_8+432]; +; CHECK-NEXT: ld.param.v4.b32 {%r149, %r150, 
%r151, %r152}, [nvvm_tcgen05_st_16x32bx2_param_8+416]; +; CHECK-NEXT: ld.param.v4.b32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x32bx2_param_8+400]; +; CHECK-NEXT: ld.param.v4.b32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x32bx2_param_8+384]; +; CHECK-NEXT: ld.param.v4.b32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x32bx2_param_8+368]; +; CHECK-NEXT: ld.param.v4.b32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x32bx2_param_8+352]; +; CHECK-NEXT: ld.param.v4.b32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x32bx2_param_8+336]; +; CHECK-NEXT: ld.param.v4.b32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_16x32bx2_param_8+320]; +; CHECK-NEXT: ld.param.v4.b32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x32bx2_param_8+304]; +; CHECK-NEXT: ld.param.v4.b32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x32bx2_param_8+288]; +; CHECK-NEXT: ld.param.v4.b32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x32bx2_param_8+272]; +; CHECK-NEXT: ld.param.v4.b32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x32bx2_param_8+256]; +; CHECK-NEXT: ld.param.v4.b32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_16x32bx2_param_8+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_16x32bx2_param_8+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x32bx2_param_8+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x32bx2_param_8+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x32bx2_param_8+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_16x32bx2_param_8+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x32bx2_param_8+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_16x32bx2_param_8+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_16x32bx2_param_8+112]; +; CHECK-NEXT: 
ld.param.v4.b32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x32bx2_param_8+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x32bx2_param_8+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x32bx2_param_8+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x32bx2_param_8+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x32bx2_param_8+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x32bx2_param_8+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_16x32bx2_param_8]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x128.b32 [%r1], 2, {%r253, %r254, %r255, %r256, %r249, %r250, %r251, %r252, %r245, %r246, %r247, %r248, %r241, %r242, %r243, %r244, %r237, %r238, %r239, %r240, %r233, %r234, %r235, %r236, %r229, %r230, %r231, %r232, %r225, %r226, %r227, %r228, %r221, %r222, %r223, %r224, %r217, %r218, %r219, %r220, %r213, %r214, %r215, %r216, %r209, %r210, %r211, %r212, %r205, %r206, %r207, %r208, %r201, %r202, %r203, %r204, %r197, %r198, %r199, %r200, %r193, %r194, %r195, %r196, %r189, %r190, %r191, %r192, %r185, %r186, %r187, %r188, %r181, %r182, %r183, %r184, %r177, %r178, %r179, %r180, %r173, %r174, %r175, %r176, %r169, %r170, %r171, %r172, %r165, %r166, %r167, %r168, %r161, %r162, %r163, %r164, %r157, %r158, %r159, %r160, %r153, %r154, %r155, %r156, %r149, %r150, %r151, %r152, %r145, %r146, %r147, %r148, %r141, %r142, %r143, %r144, %r137, %r138, %r139, %r140, %r133, %r134, %r135, %r136, %r129, %r130, %r131, %r132}; ; CHECK-NEXT: ret; tail call void @llvm.nvvm.tcgen05.st.16x32bx2.x1(ptr addrspace(6) %taddr, i64 2, i32 %stv1, i1 0) @@ -887,79 +887,79 @@ define void @nvvm_tcgen05_st_16x32bx2_unpack(ptr addrspace(6) %taddr, i32 %stv1, ; CHECK-NEXT: .reg .b32 %r<257>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, 
[nvvm_tcgen05_st_16x32bx2_unpack_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [nvvm_tcgen05_st_16x32bx2_unpack_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [nvvm_tcgen05_st_16x32bx2_unpack_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [nvvm_tcgen05_st_16x32bx2_unpack_param_1]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x1.unpack::16b.b32 [%r1], 2, {%r2}; -; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [nvvm_tcgen05_st_16x32bx2_unpack_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [nvvm_tcgen05_st_16x32bx2_unpack_param_2]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x2.unpack::16b.b32 [%r1], 2, {%r3, %r4}; -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x32bx2_unpack_param_3]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [nvvm_tcgen05_st_16x32bx2_unpack_param_3]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x4.unpack::16b.b32 [%r1], 2, {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: ld.param.v4.u32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x32bx2_unpack_param_4+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x32bx2_unpack_param_4]; +; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [nvvm_tcgen05_st_16x32bx2_unpack_param_4+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [nvvm_tcgen05_st_16x32bx2_unpack_param_4]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x8.unpack::16b.b32 [%r1], 2, {%r13, %r14, %r15, %r16, %r9, %r10, %r11, %r12}; -; CHECK-NEXT: ld.param.v4.u32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r21, %r22, %r23, %r24}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5]; +; CHECK-NEXT: ld.param.v4.b32 {%r17, %r18, %r19, %r20}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r21, %r22, %r23, 
%r24}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r25, %r26, %r27, %r28}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r29, %r30, %r31, %r32}, [nvvm_tcgen05_st_16x32bx2_unpack_param_5]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x16.unpack::16b.b32 [%r1], 2, {%r29, %r30, %r31, %r32, %r25, %r26, %r27, %r28, %r21, %r22, %r23, %r24, %r17, %r18, %r19, %r20}; -; CHECK-NEXT: ld.param.v4.u32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6]; +; CHECK-NEXT: ld.param.v4.b32 {%r33, %r34, %r35, %r36}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r37, %r38, %r39, %r40}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r41, %r42, %r43, %r44}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r45, %r46, %r47, %r48}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r49, %r50, %r51, %r52}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r53, %r54, %r55, %r56}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r57, %r58, %r59, %r60}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6+16]; +; CHECK-NEXT: 
ld.param.v4.b32 {%r61, %r62, %r63, %r64}, [nvvm_tcgen05_st_16x32bx2_unpack_param_6]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x32.unpack::16b.b32 [%r1], 2, {%r61, %r62, %r63, %r64, %r57, %r58, %r59, %r60, %r53, %r54, %r55, %r56, %r49, %r50, %r51, %r52, %r45, %r46, %r47, %r48, %r41, %r42, %r43, %r44, %r37, %r38, %r39, %r40, %r33, %r34, %r35, %r36}; -; CHECK-NEXT: ld.param.v4.u32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r125, %r126, %r127, %r128}, 
[nvvm_tcgen05_st_16x32bx2_unpack_param_7]; +; CHECK-NEXT: ld.param.v4.b32 {%r65, %r66, %r67, %r68}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r69, %r70, %r71, %r72}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r73, %r74, %r75, %r76}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r77, %r78, %r79, %r80}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r81, %r82, %r83, %r84}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r85, %r86, %r87, %r88}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r89, %r90, %r91, %r92}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r93, %r94, %r95, %r96}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r97, %r98, %r99, %r100}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r101, %r102, %r103, %r104}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r105, %r106, %r107, %r108}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r109, %r110, %r111, %r112}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r113, %r114, %r115, %r116}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r117, %r118, %r119, %r120}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r121, %r122, %r123, %r124}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r125, %r126, %r127, %r128}, [nvvm_tcgen05_st_16x32bx2_unpack_param_7]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x64.unpack::16b.b32 [%r1], 2, {%r125, %r126, %r127, %r128, %r121, %r122, %r123, %r124, %r117, %r118, %r119, %r120, %r113, %r114, %r115, %r116, %r109, %r110, %r111, %r112, %r105, %r106, %r107, %r108, %r101, %r102, %r103, %r104, 
%r97, %r98, %r99, %r100, %r93, %r94, %r95, %r96, %r89, %r90, %r91, %r92, %r85, %r86, %r87, %r88, %r81, %r82, %r83, %r84, %r77, %r78, %r79, %r80, %r73, %r74, %r75, %r76, %r69, %r70, %r71, %r72, %r65, %r66, %r67, %r68}; -; CHECK-NEXT: ld.param.v4.u32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+496]; -; CHECK-NEXT: ld.param.v4.u32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+480]; -; CHECK-NEXT: ld.param.v4.u32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+464]; -; CHECK-NEXT: ld.param.v4.u32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+448]; -; CHECK-NEXT: ld.param.v4.u32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+432]; -; CHECK-NEXT: ld.param.v4.u32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+416]; -; CHECK-NEXT: ld.param.v4.u32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+400]; -; CHECK-NEXT: ld.param.v4.u32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+384]; -; CHECK-NEXT: ld.param.v4.u32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+368]; -; CHECK-NEXT: ld.param.v4.u32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+352]; -; CHECK-NEXT: ld.param.v4.u32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+336]; -; CHECK-NEXT: ld.param.v4.u32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+320]; -; CHECK-NEXT: ld.param.v4.u32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+304]; -; CHECK-NEXT: ld.param.v4.u32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+288]; -; CHECK-NEXT: ld.param.v4.u32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+272]; -; CHECK-NEXT: ld.param.v4.u32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+256]; -; CHECK-NEXT: ld.param.v4.u32 {%r193, %r194, %r195, 
%r196}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+240]; -; CHECK-NEXT: ld.param.v4.u32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+224]; -; CHECK-NEXT: ld.param.v4.u32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+208]; -; CHECK-NEXT: ld.param.v4.u32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+192]; -; CHECK-NEXT: ld.param.v4.u32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+176]; -; CHECK-NEXT: ld.param.v4.u32 {%r213, %r214, %r215, %r216}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+160]; -; CHECK-NEXT: ld.param.v4.u32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+144]; -; CHECK-NEXT: ld.param.v4.u32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+128]; -; CHECK-NEXT: ld.param.v4.u32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+112]; -; CHECK-NEXT: ld.param.v4.u32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+96]; -; CHECK-NEXT: ld.param.v4.u32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+80]; -; CHECK-NEXT: ld.param.v4.u32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+64]; -; CHECK-NEXT: ld.param.v4.u32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+48]; -; CHECK-NEXT: ld.param.v4.u32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+32]; -; CHECK-NEXT: ld.param.v4.u32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+16]; -; CHECK-NEXT: ld.param.v4.u32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8]; +; CHECK-NEXT: ld.param.v4.b32 {%r129, %r130, %r131, %r132}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+496]; +; CHECK-NEXT: ld.param.v4.b32 {%r133, %r134, %r135, %r136}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+480]; +; CHECK-NEXT: ld.param.v4.b32 {%r137, %r138, %r139, %r140}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+464]; +; 
CHECK-NEXT: ld.param.v4.b32 {%r141, %r142, %r143, %r144}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+448]; +; CHECK-NEXT: ld.param.v4.b32 {%r145, %r146, %r147, %r148}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+432]; +; CHECK-NEXT: ld.param.v4.b32 {%r149, %r150, %r151, %r152}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+416]; +; CHECK-NEXT: ld.param.v4.b32 {%r153, %r154, %r155, %r156}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+400]; +; CHECK-NEXT: ld.param.v4.b32 {%r157, %r158, %r159, %r160}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+384]; +; CHECK-NEXT: ld.param.v4.b32 {%r161, %r162, %r163, %r164}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+368]; +; CHECK-NEXT: ld.param.v4.b32 {%r165, %r166, %r167, %r168}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+352]; +; CHECK-NEXT: ld.param.v4.b32 {%r169, %r170, %r171, %r172}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+336]; +; CHECK-NEXT: ld.param.v4.b32 {%r173, %r174, %r175, %r176}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+320]; +; CHECK-NEXT: ld.param.v4.b32 {%r177, %r178, %r179, %r180}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+304]; +; CHECK-NEXT: ld.param.v4.b32 {%r181, %r182, %r183, %r184}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+288]; +; CHECK-NEXT: ld.param.v4.b32 {%r185, %r186, %r187, %r188}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+272]; +; CHECK-NEXT: ld.param.v4.b32 {%r189, %r190, %r191, %r192}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+256]; +; CHECK-NEXT: ld.param.v4.b32 {%r193, %r194, %r195, %r196}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+240]; +; CHECK-NEXT: ld.param.v4.b32 {%r197, %r198, %r199, %r200}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+224]; +; CHECK-NEXT: ld.param.v4.b32 {%r201, %r202, %r203, %r204}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+208]; +; CHECK-NEXT: ld.param.v4.b32 {%r205, %r206, %r207, %r208}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+192]; +; CHECK-NEXT: ld.param.v4.b32 {%r209, %r210, %r211, %r212}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+176]; +; CHECK-NEXT: ld.param.v4.b32 {%r213, %r214, %r215, 
%r216}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+160]; +; CHECK-NEXT: ld.param.v4.b32 {%r217, %r218, %r219, %r220}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+144]; +; CHECK-NEXT: ld.param.v4.b32 {%r221, %r222, %r223, %r224}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+128]; +; CHECK-NEXT: ld.param.v4.b32 {%r225, %r226, %r227, %r228}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+112]; +; CHECK-NEXT: ld.param.v4.b32 {%r229, %r230, %r231, %r232}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+96]; +; CHECK-NEXT: ld.param.v4.b32 {%r233, %r234, %r235, %r236}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+80]; +; CHECK-NEXT: ld.param.v4.b32 {%r237, %r238, %r239, %r240}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+64]; +; CHECK-NEXT: ld.param.v4.b32 {%r241, %r242, %r243, %r244}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+48]; +; CHECK-NEXT: ld.param.v4.b32 {%r245, %r246, %r247, %r248}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+32]; +; CHECK-NEXT: ld.param.v4.b32 {%r249, %r250, %r251, %r252}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8+16]; +; CHECK-NEXT: ld.param.v4.b32 {%r253, %r254, %r255, %r256}, [nvvm_tcgen05_st_16x32bx2_unpack_param_8]; ; CHECK-NEXT: tcgen05.st.sync.aligned.16x32bx2.x128.unpack::16b.b32 [%r1], 2, {%r253, %r254, %r255, %r256, %r249, %r250, %r251, %r252, %r245, %r246, %r247, %r248, %r241, %r242, %r243, %r244, %r237, %r238, %r239, %r240, %r233, %r234, %r235, %r236, %r229, %r230, %r231, %r232, %r225, %r226, %r227, %r228, %r221, %r222, %r223, %r224, %r217, %r218, %r219, %r220, %r213, %r214, %r215, %r216, %r209, %r210, %r211, %r212, %r205, %r206, %r207, %r208, %r201, %r202, %r203, %r204, %r197, %r198, %r199, %r200, %r193, %r194, %r195, %r196, %r189, %r190, %r191, %r192, %r185, %r186, %r187, %r188, %r181, %r182, %r183, %r184, %r177, %r178, %r179, %r180, %r173, %r174, %r175, %r176, %r169, %r170, %r171, %r172, %r165, %r166, %r167, %r168, %r161, %r162, %r163, %r164, %r157, %r158, %r159, %r160, %r153, %r154, %r155, %r156, %r149, %r150, %r151, %r152, %r145, %r146, %r147, %r148, %r141, 
%r142, %r143, %r144, %r137, %r138, %r139, %r140, %r133, %r134, %r135, %r136, %r129, %r130, %r131, %r132}; ; CHECK-NEXT: ret; tail call void @llvm.nvvm.tcgen05.st.16x32bx2.x1(ptr addrspace(6) %taddr, i64 2, i32 %stv1, i1 1) diff --git a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll index 4e4e3f3aaec6..3d6489a2340d 100644 --- a/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll +++ b/llvm/test/CodeGen/NVPTX/tex-read-cuda.ll @@ -18,12 +18,12 @@ define ptx_kernel void @foo(i64 %img, ptr %red, i32 %idx) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [foo_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [foo_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [foo_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [foo_param_1]; ; CHECK-NEXT: cvta.to.global.u64 %rd3, %rd2; -; CHECK-NEXT: ld.param.u32 %r1, [foo_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [foo_param_2]; ; CHECK-NEXT: tex.1d.v4.f32.s32 {%f1, %f2, %f3, %f4}, [%rd1, {%r1}]; -; CHECK-NEXT: st.global.f32 [%rd3], %f1; +; CHECK-NEXT: st.global.b32 [%rd3], %f1; ; CHECK-NEXT: ret; %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %img, i32 %idx) %ret = extractvalue { float, float, float, float } %val, 0 @@ -42,11 +42,11 @@ define ptx_kernel void @bar(ptr %red, i32 %idx) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [bar_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [bar_param_0]; ; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1; -; CHECK-NEXT: ld.param.u32 %r1, [bar_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [bar_param_1]; ; CHECK-NEXT: tex.1d.v4.f32.s32 {%f1, %f2, %f3, %f4}, [tex0, {%r1}]; -; CHECK-NEXT: st.global.f32 [%rd2], %f1; +; CHECK-NEXT: st.global.b32 [%rd2], %f1; ; CHECK-NEXT: ret; %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @tex0) %val = tail call { float, float, float, float } 
@llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx) @@ -65,9 +65,9 @@ define ptx_kernel void @baz(ptr %red, i32 %idx) { ; CHECK-NEXT: .reg .b64 %rd<4>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [baz_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [baz_param_0]; ; CHECK-NEXT: cvta.to.global.u64 %rd2, %rd1; -; CHECK-NEXT: ld.param.u32 %r1, [baz_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [baz_param_1]; ; CHECK-NEXT: mov.u64 %rd3, tex0; ; CHECK-NEXT: tex.1d.v4.f32.s32 {%f1, %f2, %f3, %f4}, [tex0, {%r1}]; ; CHECK-NEXT: { // callseq 0, 0 @@ -79,10 +79,10 @@ define ptx_kernel void @baz(ptr %red, i32 %idx) { ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); -; CHECK-NEXT: ld.param.f32 %f5, [retval0]; +; CHECK-NEXT: ld.param.b32 %f5, [retval0]; ; CHECK-NEXT: } // callseq 0 ; CHECK-NEXT: add.rn.f32 %f7, %f1, %f5; -; CHECK-NEXT: st.global.f32 [%rd2], %f7; +; CHECK-NEXT: st.global.b32 [%rd2], %f7; ; CHECK-NEXT: ret; %texHandle = tail call i64 @llvm.nvvm.texsurf.handle.internal.p1(ptr addrspace(1) @tex0) %val = tail call { float, float, float, float } @llvm.nvvm.tex.unified.1d.v4f32.s32(i64 %texHandle, i32 %idx) diff --git a/llvm/test/CodeGen/NVPTX/tex-read.ll b/llvm/test/CodeGen/NVPTX/tex-read.ll index d74c89f5abc8..22116b2fafc3 100644 --- a/llvm/test/CodeGen/NVPTX/tex-read.ll +++ b/llvm/test/CodeGen/NVPTX/tex-read.ll @@ -10,7 +10,7 @@ define ptx_kernel void @foo(i64 %img, i64 %sampler, ptr %red, i32 %idx) { ; CHECK: tex.1d.v4.f32.s32 {%f[[RED:[0-9]+]], %f[[GREEN:[0-9]+]], %f[[BLUE:[0-9]+]], %f[[ALPHA:[0-9]+]]}, [foo_param_0, foo_param_1, {%r{{[0-9]+}}}] %val = tail call { float, float, float, float } @llvm.nvvm.tex.1d.v4f32.s32(i64 %img, i64 %sampler, i32 %idx) %ret = extractvalue { float, float, float, float } %val, 0 -; CHECK: st.f32 [%rd{{[0-9]+}}], %f[[RED]] +; CHECK: st.b32 [%rd{{[0-9]+}}], %f[[RED]] store float %ret, ptr %red ret void } diff --git a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll 
b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll index c9f9ccca82c6..4edbec48e6be 100644 --- a/llvm/test/CodeGen/NVPTX/texsurf-queries.ll +++ b/llvm/test/CodeGen/NVPTX/texsurf-queries.ll @@ -23,7 +23,7 @@ define i32 @t0(i64 %texHandle) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [t0_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [t0_param_0]; ; CHECK-NEXT: txq.width.b32 %r1, [%rd1]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -54,7 +54,7 @@ define i32 @t2(i64 %texHandle) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [t2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [t2_param_0]; ; CHECK-NEXT: txq.height.b32 %r1, [%rd1]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -85,7 +85,7 @@ define i32 @s0(i64 %surfHandle) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [s0_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [s0_param_0]; ; CHECK-NEXT: suq.width.b32 %r1, [%rd1]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; @@ -116,7 +116,7 @@ define i32 @s2(i64 %surfHandle) { ; CHECK-NEXT: .reg .b64 %rd<2>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [s2_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [s2_param_0]; ; CHECK-NEXT: suq.height.b32 %r1, [%rd1]; ; CHECK-NEXT: st.param.b32 [func_retval0], %r1; ; CHECK-NEXT: ret; diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll index ff74df124b41..82ebb0ca5737 100644 --- a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll +++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll @@ -22,9 +22,9 @@ ; CHECK: .visible .func (.param .align 8 .b8 func_retval0[16]) ; CHECK-LABEL: test_s_i8i16p( ; CHECK: .param .align 8 .b8 test_s_i8i16p_param_0[16] -; CHECK-DAG: ld.param.u16 [[P0:%rs[0-9]+]], 
[test_s_i8i16p_param_0]; -; CHECK-DAG: ld.param.u8 [[P2_0:%rs[0-9]+]], [test_s_i8i16p_param_0+3]; -; CHECK-DAG: ld.param.u8 [[P2_1:%rs[0-9]+]], [test_s_i8i16p_param_0+4]; +; CHECK-DAG: ld.param.b16 [[P0:%rs[0-9]+]], [test_s_i8i16p_param_0]; +; CHECK-DAG: ld.param.b8 [[P2_0:%rs[0-9]+]], [test_s_i8i16p_param_0+3]; +; CHECK-DAG: ld.param.b8 [[P2_1:%rs[0-9]+]], [test_s_i8i16p_param_0+4]; ; CHECK-DAG: shl.b16 [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8; ; CHECK-DAG: or.b16 [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]]; ; CHECK: { // callseq @@ -59,11 +59,11 @@ define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) { ; CHECK: .visible .func (.param .align 8 .b8 func_retval0[24]) ; CHECK-LABEL: test_s_i8i32p( ; CHECK: .param .align 8 .b8 test_s_i8i32p_param_0[24] -; CHECK-DAG: ld.param.u32 [[P0:%r[0-9]+]], [test_s_i8i32p_param_0]; -; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8i32p_param_0+5]; -; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8i32p_param_0+6]; -; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8i32p_param_0+7]; -; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8i32p_param_0+8]; +; CHECK-DAG: ld.param.b32 [[P0:%r[0-9]+]], [test_s_i8i32p_param_0]; +; CHECK-DAG: ld.param.b8 [[P2_0:%r[0-9]+]], [test_s_i8i32p_param_0+5]; +; CHECK-DAG: ld.param.b8 [[P2_1:%r[0-9]+]], [test_s_i8i32p_param_0+6]; +; CHECK-DAG: ld.param.b8 [[P2_2:%r[0-9]+]], [test_s_i8i32p_param_0+7]; +; CHECK-DAG: ld.param.b8 [[P2_3:%r[0-9]+]], [test_s_i8i32p_param_0+8]; ; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8; ; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16; ; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24; @@ -106,15 +106,15 @@ define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) { ; CHECK: .visible .func (.param .align 8 .b8 func_retval0[32]) ; CHECK-LABEL: test_s_i8i64p( ; CHECK: .param .align 8 .b8 test_s_i8i64p_param_0[32] -; CHECK-DAG: ld.param.u64 [[P0:%rd[0-9]+]], [test_s_i8i64p_param_0]; -; CHECK-DAG: ld.param.u8 [[P2_0:%rd[0-9]+]], [test_s_i8i64p_param_0+9]; 
-; CHECK-DAG: ld.param.u8 [[P2_1:%rd[0-9]+]], [test_s_i8i64p_param_0+10]; -; CHECK-DAG: ld.param.u8 [[P2_2:%rd[0-9]+]], [test_s_i8i64p_param_0+11]; -; CHECK-DAG: ld.param.u8 [[P2_3:%rd[0-9]+]], [test_s_i8i64p_param_0+12]; -; CHECK-DAG: ld.param.u8 [[P2_4:%rd[0-9]+]], [test_s_i8i64p_param_0+13]; -; CHECK-DAG: ld.param.u8 [[P2_5:%rd[0-9]+]], [test_s_i8i64p_param_0+14]; -; CHECK-DAG: ld.param.u8 [[P2_6:%rd[0-9]+]], [test_s_i8i64p_param_0+15]; -; CHECK-DAG: ld.param.u8 [[P2_7:%rd[0-9]+]], [test_s_i8i64p_param_0+16]; +; CHECK-DAG: ld.param.b64 [[P0:%rd[0-9]+]], [test_s_i8i64p_param_0]; +; CHECK-DAG: ld.param.b8 [[P2_0:%rd[0-9]+]], [test_s_i8i64p_param_0+9]; +; CHECK-DAG: ld.param.b8 [[P2_1:%rd[0-9]+]], [test_s_i8i64p_param_0+10]; +; CHECK-DAG: ld.param.b8 [[P2_2:%rd[0-9]+]], [test_s_i8i64p_param_0+11]; +; CHECK-DAG: ld.param.b8 [[P2_3:%rd[0-9]+]], [test_s_i8i64p_param_0+12]; +; CHECK-DAG: ld.param.b8 [[P2_4:%rd[0-9]+]], [test_s_i8i64p_param_0+13]; +; CHECK-DAG: ld.param.b8 [[P2_5:%rd[0-9]+]], [test_s_i8i64p_param_0+14]; +; CHECK-DAG: ld.param.b8 [[P2_6:%rd[0-9]+]], [test_s_i8i64p_param_0+15]; +; CHECK-DAG: ld.param.b8 [[P2_7:%rd[0-9]+]], [test_s_i8i64p_param_0+16]; ; CHECK-DAG: shl.b64 [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8; ; CHECK-DAG: shl.b64 [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16; ; CHECK-DAG: shl.b64 [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24; @@ -182,8 +182,8 @@ define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) { ; CHECK-LABEL: test_s_i8f16p( ; CHECK: .param .align 8 .b8 test_s_i8f16p_param_0[16] ; CHECK-DAG: ld.param.b16 [[P0:%rs[0-9]+]], [test_s_i8f16p_param_0]; -; CHECK-DAG: ld.param.u8 [[P2_0:%rs[0-9]+]], [test_s_i8f16p_param_0+3]; -; CHECK-DAG: ld.param.u8 [[P2_1:%rs[0-9]+]], [test_s_i8f16p_param_0+4]; +; CHECK-DAG: ld.param.b8 [[P2_0:%rs[0-9]+]], [test_s_i8f16p_param_0+3]; +; CHECK-DAG: ld.param.b8 [[P2_1:%rs[0-9]+]], [test_s_i8f16p_param_0+4]; ; CHECK-DAG: shl.b16 [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8; ; CHECK-DAG: or.b16 [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]]; ; 
CHECK: { // callseq @@ -219,10 +219,10 @@ define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) { ; CHECK-LABEL: test_s_i8f16x2p( ; CHECK: .param .align 8 .b8 test_s_i8f16x2p_param_0[24] ; CHECK-DAG: ld.param.b32 [[P0:%r[0-9]+]], [test_s_i8f16x2p_param_0]; -; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8f16x2p_param_0+5]; -; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8f16x2p_param_0+6]; -; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8f16x2p_param_0+7]; -; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8f16x2p_param_0+8]; +; CHECK-DAG: ld.param.b8 [[P2_0:%r[0-9]+]], [test_s_i8f16x2p_param_0+5]; +; CHECK-DAG: ld.param.b8 [[P2_1:%r[0-9]+]], [test_s_i8f16x2p_param_0+6]; +; CHECK-DAG: ld.param.b8 [[P2_2:%r[0-9]+]], [test_s_i8f16x2p_param_0+7]; +; CHECK-DAG: ld.param.b8 [[P2_3:%r[0-9]+]], [test_s_i8f16x2p_param_0+8]; ; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8; ; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16; ; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24; @@ -265,11 +265,11 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) { ; CHECK: .visible .func (.param .align 8 .b8 func_retval0[24]) ; CHECK-LABEL: test_s_i8f32p( ; CHECK: .param .align 8 .b8 test_s_i8f32p_param_0[24] -; CHECK-DAG: ld.param.f32 [[P0:%f[0-9]+]], [test_s_i8f32p_param_0]; -; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8f32p_param_0+5]; -; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8f32p_param_0+6]; -; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8f32p_param_0+7]; -; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8f32p_param_0+8]; +; CHECK-DAG: ld.param.b32 [[P0:%f[0-9]+]], [test_s_i8f32p_param_0]; +; CHECK-DAG: ld.param.b8 [[P2_0:%r[0-9]+]], [test_s_i8f32p_param_0+5]; +; CHECK-DAG: ld.param.b8 [[P2_1:%r[0-9]+]], [test_s_i8f32p_param_0+6]; +; CHECK-DAG: ld.param.b8 [[P2_2:%r[0-9]+]], [test_s_i8f32p_param_0+7]; +; CHECK-DAG: ld.param.b8 [[P2_3:%r[0-9]+]], [test_s_i8f32p_param_0+8]; ; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], 
[[P2_1]], 8; ; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16; ; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24; @@ -280,7 +280,7 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) { ; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16; ; CHECK: { // callseq ; CHECK-DAG: .param .align 8 .b8 param0[24]; -; CHECK-DAG: st.param.f32 [param0], [[P0]]; +; CHECK-DAG: st.param.b32 [param0], [[P0]]; ; CHECK-DAG: st.param.b8 [param0+5], [[P2]]; ; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]]; ; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]]; @@ -291,13 +291,13 @@ define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) { ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); -; CHECK-DAG: ld.param.f32 [[R0:%f[0-9]+]], [retval0]; +; CHECK-DAG: ld.param.b32 [[R0:%f[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5]; ; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6]; ; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7]; ; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8]; ; CHECK: } // callseq -; CHECK-DAG: st.param.f32 [func_retval0], [[R0]]; +; CHECK-DAG: st.param.b32 [func_retval0], [[R0]]; ; CHECK-DAG: st.param.b8 [func_retval0+5], ; CHECK-DAG: st.param.b8 [func_retval0+6], ; CHECK-DAG: st.param.b8 [func_retval0+7], @@ -312,15 +312,15 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) { ; CHECK: .visible .func (.param .align 8 .b8 func_retval0[32]) ; CHECK-LABEL: test_s_i8f64p( ; CHECK: .param .align 8 .b8 test_s_i8f64p_param_0[32] -; CHECK-DAG: ld.param.f64 [[P0:%fd[0-9]+]], [test_s_i8f64p_param_0]; -; CHECK-DAG: ld.param.u8 [[P2_0:%rd[0-9]+]], [test_s_i8f64p_param_0+9]; -; CHECK-DAG: ld.param.u8 [[P2_1:%rd[0-9]+]], [test_s_i8f64p_param_0+10]; -; CHECK-DAG: ld.param.u8 [[P2_2:%rd[0-9]+]], [test_s_i8f64p_param_0+11]; -; CHECK-DAG: ld.param.u8 [[P2_3:%rd[0-9]+]], [test_s_i8f64p_param_0+12]; -; CHECK-DAG: ld.param.u8 [[P2_4:%rd[0-9]+]], [test_s_i8f64p_param_0+13]; -; CHECK-DAG: ld.param.u8 [[P2_5:%rd[0-9]+]], 
[test_s_i8f64p_param_0+14]; -; CHECK-DAG: ld.param.u8 [[P2_6:%rd[0-9]+]], [test_s_i8f64p_param_0+15]; -; CHECK-DAG: ld.param.u8 [[P2_7:%rd[0-9]+]], [test_s_i8f64p_param_0+16]; +; CHECK-DAG: ld.param.b64 [[P0:%fd[0-9]+]], [test_s_i8f64p_param_0]; +; CHECK-DAG: ld.param.b8 [[P2_0:%rd[0-9]+]], [test_s_i8f64p_param_0+9]; +; CHECK-DAG: ld.param.b8 [[P2_1:%rd[0-9]+]], [test_s_i8f64p_param_0+10]; +; CHECK-DAG: ld.param.b8 [[P2_2:%rd[0-9]+]], [test_s_i8f64p_param_0+11]; +; CHECK-DAG: ld.param.b8 [[P2_3:%rd[0-9]+]], [test_s_i8f64p_param_0+12]; +; CHECK-DAG: ld.param.b8 [[P2_4:%rd[0-9]+]], [test_s_i8f64p_param_0+13]; +; CHECK-DAG: ld.param.b8 [[P2_5:%rd[0-9]+]], [test_s_i8f64p_param_0+14]; +; CHECK-DAG: ld.param.b8 [[P2_6:%rd[0-9]+]], [test_s_i8f64p_param_0+15]; +; CHECK-DAG: ld.param.b8 [[P2_7:%rd[0-9]+]], [test_s_i8f64p_param_0+16]; ; CHECK-DAG: shl.b64 [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8; ; CHECK-DAG: shl.b64 [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16; ; CHECK-DAG: shl.b64 [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24; @@ -343,7 +343,7 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) { ; CHECK-DAG: bfe.u64 [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8; ; CHECK: { // callseq ; CHECK: .param .align 8 .b8 param0[32]; -; CHECK-DAG: st.param.f64 [param0], [[P0]]; +; CHECK-DAG: st.param.b64 [param0], [[P0]]; ; CHECK-DAG: st.param.b8 [param0+9], [[P2]]; ; CHECK-DAG: st.param.b8 [param0+10], [[P2_shr_1]]; ; CHECK-DAG: st.param.b8 [param0+11], [[P2_shr_2]]; @@ -358,7 +358,7 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) { ; CHECK-NEXT: ( ; CHECK-NEXT: param0 ; CHECK-NEXT: ); -; CHECK-DAG: ld.param.f64 [[R0:%fd[0-9]+]], [retval0]; +; CHECK-DAG: ld.param.b64 [[R0:%fd[0-9]+]], [retval0]; ; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+9]; ; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+10]; ; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+11]; @@ -368,7 +368,7 @@ define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) { ; CHECK-DAG: ld.param.b8 [[R2_6:%rs[0-9]+]], [retval0+15]; ; CHECK-DAG: 
ld.param.b8 [[R2_7:%rs[0-9]+]], [retval0+16]; ; CHECK: } // callseq -; CHECK-DAG: st.param.f64 [func_retval0], [[R0]]; +; CHECK-DAG: st.param.b64 [func_retval0], [[R0]]; ; CHECK-DAG: st.param.b8 [func_retval0+9], ; CHECK-DAG: st.param.b8 [func_retval0+10], ; CHECK-DAG: st.param.b8 [func_retval0+11], diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll index 303c649b794f..8e4c77e76029 100644 --- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll @@ -11,10 +11,10 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-NEXT: .reg .b16 %rs<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [out_v1i8_param_0]; -; CHECK-NEXT: ld.param.u8 %rs2, [out_v1i8_param_2]; +; CHECK-NEXT: ld.param.b8 %rs1, [out_v1i8_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [out_v1i8_param_2]; ; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2; -; CHECK-NEXT: ld.param.u8 %rs4, [out_v1i8_param_1]; +; CHECK-NEXT: ld.param.b8 %rs4, [out_v1i8_param_1]; ; CHECK-NEXT: not.b16 %rs5, %rs2; ; CHECK-NEXT: and.b16 %rs6, %rs4, %rs5; ; CHECK-NEXT: or.b16 %rs7, %rs3, %rs6; @@ -37,10 +37,10 @@ define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwin ; CHECK-NEXT: .reg .b16 %rs<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [out_v1i16_param_0]; -; CHECK-NEXT: ld.param.u16 %rs2, [out_v1i16_param_2]; +; CHECK-NEXT: ld.param.b16 %rs1, [out_v1i16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [out_v1i16_param_2]; ; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2; -; CHECK-NEXT: ld.param.u16 %rs4, [out_v1i16_param_1]; +; CHECK-NEXT: ld.param.b16 %rs4, [out_v1i16_param_1]; ; CHECK-NEXT: not.b16 %rs5, %rs2; ; CHECK-NEXT: and.b16 %rs6, %rs4, %rs5; ; CHECK-NEXT: or.b16 %rs7, %rs3, %rs6; @@ -63,9 +63,9 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> 
%y, <4 x i8> %mask) nounwind { ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_param_1]; -; CHECK-NEXT: ld.param.u32 %r2, [out_v4i8_param_0]; -; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [out_v4i8_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [out_v4i8_param_0]; +; CHECK-NEXT: ld.param.b32 %r3, [out_v4i8_param_2]; ; CHECK-NEXT: and.b32 %r4, %r2, %r3; ; CHECK-NEXT: xor.b32 %r5, %r3, -1; ; CHECK-NEXT: and.b32 %r6, %r1, %r5; @@ -85,9 +85,9 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [out_v4i8_undef_param_1]; -; CHECK-NEXT: ld.param.u32 %r2, [out_v4i8_undef_param_0]; -; CHECK-NEXT: ld.param.u32 %r3, [out_v4i8_undef_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [out_v4i8_undef_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [out_v4i8_undef_param_0]; +; CHECK-NEXT: ld.param.b32 %r3, [out_v4i8_undef_param_2]; ; CHECK-NEXT: and.b32 %r4, %r2, %r3; ; CHECK-NEXT: xor.b32 %r5, %r3, -16711681; ; CHECK-NEXT: and.b32 %r6, %r1, %r5; @@ -107,9 +107,9 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwin ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [out_v2i16_param_1]; -; CHECK-NEXT: ld.param.u32 %r2, [out_v2i16_param_0]; -; CHECK-NEXT: ld.param.u32 %r3, [out_v2i16_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [out_v2i16_param_1]; +; CHECK-NEXT: ld.param.b32 %r2, [out_v2i16_param_0]; +; CHECK-NEXT: ld.param.b32 %r3, [out_v2i16_param_2]; ; CHECK-NEXT: and.b32 %r4, %r2, %r3; ; CHECK-NEXT: xor.b32 %r5, %r3, -1; ; CHECK-NEXT: and.b32 %r6, %r1, %r5; @@ -129,10 +129,10 @@ define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwin ; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, 
[out_v1i32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [out_v1i32_param_2]; +; CHECK-NEXT: ld.param.b32 %r1, [out_v1i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [out_v1i32_param_2]; ; CHECK-NEXT: and.b32 %r3, %r1, %r2; -; CHECK-NEXT: ld.param.u32 %r4, [out_v1i32_param_1]; +; CHECK-NEXT: ld.param.b32 %r4, [out_v1i32_param_1]; ; CHECK-NEXT: not.b32 %r5, %r2; ; CHECK-NEXT: and.b32 %r6, %r4, %r5; ; CHECK-NEXT: or.b32 %r7, %r3, %r6; @@ -155,11 +155,11 @@ define <8 x i8> @out_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-NEXT: .reg .b32 %r<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v8i8_param_0]; -; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [out_v8i8_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [out_v8i8_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [out_v8i8_param_2]; ; CHECK-NEXT: and.b32 %r5, %r1, %r3; ; CHECK-NEXT: and.b32 %r6, %r2, %r4; -; CHECK-NEXT: ld.param.v2.u32 {%r7, %r8}, [out_v8i8_param_1]; +; CHECK-NEXT: ld.param.v2.b32 {%r7, %r8}, [out_v8i8_param_1]; ; CHECK-NEXT: xor.b32 %r9, %r4, -1; ; CHECK-NEXT: xor.b32 %r10, %r3, -1; ; CHECK-NEXT: and.b32 %r11, %r7, %r10; @@ -181,11 +181,11 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin ; CHECK-NEXT: .reg .b32 %r<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v4i16_param_0]; -; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [out_v4i16_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [out_v4i16_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [out_v4i16_param_2]; ; CHECK-NEXT: and.b32 %r5, %r1, %r3; ; CHECK-NEXT: and.b32 %r6, %r2, %r4; -; CHECK-NEXT: ld.param.v2.u32 {%r7, %r8}, [out_v4i16_param_1]; +; CHECK-NEXT: ld.param.v2.b32 {%r7, %r8}, [out_v4i16_param_1]; ; CHECK-NEXT: xor.b32 %r9, %r4, -1; ; CHECK-NEXT: xor.b32 %r10, %r3, -1; ; CHECK-NEXT: and.b32 %r11, %r7, %r10; @@ -207,11 +207,11 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 
x i16> %mask) n ; CHECK-NEXT: .reg .b32 %r<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v4i16_undef_param_0]; -; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [out_v4i16_undef_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [out_v4i16_undef_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [out_v4i16_undef_param_2]; ; CHECK-NEXT: and.b32 %r5, %r1, %r3; ; CHECK-NEXT: and.b32 %r6, %r2, %r4; -; CHECK-NEXT: ld.param.v2.u32 {%r7, %r8}, [out_v4i16_undef_param_1]; +; CHECK-NEXT: ld.param.v2.b32 {%r7, %r8}, [out_v4i16_undef_param_1]; ; CHECK-NEXT: xor.b32 %r9, %r4, -65536; ; CHECK-NEXT: xor.b32 %r10, %r3, -1; ; CHECK-NEXT: and.b32 %r11, %r7, %r10; @@ -233,11 +233,11 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwin ; CHECK-NEXT: .reg .b32 %r<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [out_v2i32_param_0]; -; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [out_v2i32_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [out_v2i32_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_2]; ; CHECK-NEXT: and.b32 %r5, %r1, %r3; ; CHECK-NEXT: and.b32 %r6, %r2, %r4; -; CHECK-NEXT: ld.param.v2.u32 {%r7, %r8}, [out_v2i32_param_1]; +; CHECK-NEXT: ld.param.v2.b32 {%r7, %r8}, [out_v2i32_param_1]; ; CHECK-NEXT: not.b32 %r9, %r4; ; CHECK-NEXT: not.b32 %r10, %r3; ; CHECK-NEXT: and.b32 %r11, %r7, %r10; @@ -259,10 +259,10 @@ define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwin ; CHECK-NEXT: .reg .b64 %rd<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [out_v1i64_param_0]; -; CHECK-NEXT: ld.param.u64 %rd2, [out_v1i64_param_2]; +; CHECK-NEXT: ld.param.b64 %rd1, [out_v1i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [out_v1i64_param_2]; ; CHECK-NEXT: and.b64 %rd3, %rd1, %rd2; -; CHECK-NEXT: ld.param.u64 %rd4, [out_v1i64_param_1]; +; CHECK-NEXT: ld.param.b64 %rd4, [out_v1i64_param_1]; ; CHECK-NEXT: not.b64 
%rd5, %rd2; ; CHECK-NEXT: and.b64 %rd6, %rd4, %rd5; ; CHECK-NEXT: or.b64 %rd7, %rd3, %rd6; @@ -285,13 +285,13 @@ define <16 x i8> @out_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwin ; CHECK-NEXT: .reg .b32 %r<29>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v16i8_param_0]; -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [out_v16i8_param_2]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v16i8_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v16i8_param_2]; ; CHECK-NEXT: and.b32 %r9, %r1, %r5; ; CHECK-NEXT: and.b32 %r10, %r2, %r6; ; CHECK-NEXT: and.b32 %r11, %r3, %r7; ; CHECK-NEXT: and.b32 %r12, %r4, %r8; -; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [out_v16i8_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v16i8_param_1]; ; CHECK-NEXT: xor.b32 %r17, %r8, -1; ; CHECK-NEXT: xor.b32 %r18, %r7, -1; ; CHECK-NEXT: xor.b32 %r19, %r6, -1; @@ -319,13 +319,13 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin ; CHECK-NEXT: .reg .b32 %r<29>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v8i16_param_0]; -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [out_v8i16_param_2]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v8i16_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v8i16_param_2]; ; CHECK-NEXT: and.b32 %r9, %r1, %r5; ; CHECK-NEXT: and.b32 %r10, %r2, %r6; ; CHECK-NEXT: and.b32 %r11, %r3, %r7; ; CHECK-NEXT: and.b32 %r12, %r4, %r8; -; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [out_v8i16_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v8i16_param_1]; ; CHECK-NEXT: xor.b32 %r17, %r8, -1; ; CHECK-NEXT: xor.b32 %r18, %r7, -1; ; CHECK-NEXT: xor.b32 %r19, %r6, -1; @@ -353,13 +353,13 @@ define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwin ; CHECK-NEXT: .reg .b32 
%r<29>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v4i32_param_0]; -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_2]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_2]; ; CHECK-NEXT: and.b32 %r9, %r1, %r5; ; CHECK-NEXT: and.b32 %r10, %r2, %r6; ; CHECK-NEXT: and.b32 %r11, %r3, %r7; ; CHECK-NEXT: and.b32 %r12, %r4, %r8; -; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [out_v4i32_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_param_1]; ; CHECK-NEXT: not.b32 %r17, %r8; ; CHECK-NEXT: not.b32 %r18, %r7; ; CHECK-NEXT: not.b32 %r19, %r6; @@ -387,13 +387,13 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) n ; CHECK-NEXT: .reg .b32 %r<26>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [out_v4i32_undef_param_0]; -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [out_v4i32_undef_param_2]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_undef_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_undef_param_2]; ; CHECK-NEXT: and.b32 %r9, %r3, %r7; ; CHECK-NEXT: and.b32 %r10, %r1, %r5; ; CHECK-NEXT: and.b32 %r11, %r2, %r6; ; CHECK-NEXT: and.b32 %r12, %r4, %r8; -; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [out_v4i32_undef_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_undef_param_1]; ; CHECK-NEXT: not.b32 %r17, %r8; ; CHECK-NEXT: not.b32 %r18, %r6; ; CHECK-NEXT: not.b32 %r19, %r5; @@ -418,11 +418,11 @@ define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwin ; CHECK-NEXT: .reg .b64 %rd<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [out_v2i64_param_0]; -; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [out_v2i64_param_2]; +; CHECK-NEXT: ld.param.v2.b64 
{%rd1, %rd2}, [out_v2i64_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_2]; ; CHECK-NEXT: and.b64 %rd5, %rd1, %rd3; ; CHECK-NEXT: and.b64 %rd6, %rd2, %rd4; -; CHECK-NEXT: ld.param.v2.u64 {%rd7, %rd8}, [out_v2i64_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [out_v2i64_param_1]; ; CHECK-NEXT: not.b64 %rd9, %rd4; ; CHECK-NEXT: not.b64 %rd10, %rd3; ; CHECK-NEXT: and.b64 %rd11, %rd7, %rd10; @@ -452,10 +452,10 @@ define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-NEXT: .reg .b16 %rs<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u8 %rs1, [in_v1i8_param_0]; -; CHECK-NEXT: ld.param.u8 %rs2, [in_v1i8_param_1]; +; CHECK-NEXT: ld.param.b8 %rs1, [in_v1i8_param_0]; +; CHECK-NEXT: ld.param.b8 %rs2, [in_v1i8_param_1]; ; CHECK-NEXT: xor.b16 %rs3, %rs1, %rs2; -; CHECK-NEXT: ld.param.u8 %rs4, [in_v1i8_param_2]; +; CHECK-NEXT: ld.param.b8 %rs4, [in_v1i8_param_2]; ; CHECK-NEXT: and.b16 %rs5, %rs3, %rs4; ; CHECK-NEXT: xor.b16 %rs6, %rs5, %rs2; ; CHECK-NEXT: st.param.b8 [func_retval0], %rs6; @@ -476,10 +476,10 @@ define <1 x i16> @in_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind ; CHECK-NEXT: .reg .b16 %rs<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u16 %rs1, [in_v1i16_param_0]; -; CHECK-NEXT: ld.param.u16 %rs2, [in_v1i16_param_1]; +; CHECK-NEXT: ld.param.b16 %rs1, [in_v1i16_param_0]; +; CHECK-NEXT: ld.param.b16 %rs2, [in_v1i16_param_1]; ; CHECK-NEXT: xor.b16 %rs3, %rs1, %rs2; -; CHECK-NEXT: ld.param.u16 %rs4, [in_v1i16_param_2]; +; CHECK-NEXT: ld.param.b16 %rs4, [in_v1i16_param_2]; ; CHECK-NEXT: and.b16 %rs5, %rs3, %rs4; ; CHECK-NEXT: xor.b16 %rs6, %rs5, %rs2; ; CHECK-NEXT: st.param.b16 [func_retval0], %rs6; @@ -500,10 +500,10 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [in_v4i8_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, 
[in_v4i8_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [in_v4i8_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [in_v4i8_param_1]; ; CHECK-NEXT: xor.b32 %r3, %r1, %r2; -; CHECK-NEXT: ld.param.u32 %r4, [in_v4i8_param_2]; +; CHECK-NEXT: ld.param.b32 %r4, [in_v4i8_param_2]; ; CHECK-NEXT: and.b32 %r5, %r3, %r4; ; CHECK-NEXT: xor.b32 %r6, %r5, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; @@ -520,10 +520,10 @@ define <2 x i16> @in_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [in_v2i16_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [in_v2i16_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [in_v2i16_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [in_v2i16_param_1]; ; CHECK-NEXT: xor.b32 %r3, %r1, %r2; -; CHECK-NEXT: ld.param.u32 %r4, [in_v2i16_param_2]; +; CHECK-NEXT: ld.param.b32 %r4, [in_v2i16_param_2]; ; CHECK-NEXT: and.b32 %r5, %r3, %r4; ; CHECK-NEXT: xor.b32 %r6, %r5, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; @@ -540,10 +540,10 @@ define <1 x i32> @in_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind ; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u32 %r1, [in_v1i32_param_0]; -; CHECK-NEXT: ld.param.u32 %r2, [in_v1i32_param_1]; +; CHECK-NEXT: ld.param.b32 %r1, [in_v1i32_param_0]; +; CHECK-NEXT: ld.param.b32 %r2, [in_v1i32_param_1]; ; CHECK-NEXT: xor.b32 %r3, %r1, %r2; -; CHECK-NEXT: ld.param.u32 %r4, [in_v1i32_param_2]; +; CHECK-NEXT: ld.param.b32 %r4, [in_v1i32_param_2]; ; CHECK-NEXT: and.b32 %r5, %r3, %r4; ; CHECK-NEXT: xor.b32 %r6, %r5, %r2; ; CHECK-NEXT: st.param.b32 [func_retval0], %r6; @@ -564,9 +564,9 @@ define <8 x i8> @in_v8i8(<8 x i8> %x, <8 x i8> %y, <8 x i8> %mask) nounwind { ; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [in_v8i8_param_0]; -; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [in_v8i8_param_1]; -; CHECK-NEXT: 
ld.param.v2.u32 {%r5, %r6}, [in_v8i8_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [in_v8i8_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [in_v8i8_param_1]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [in_v8i8_param_2]; ; CHECK-NEXT: xor.b32 %r7, %r2, %r4; ; CHECK-NEXT: and.b32 %r8, %r7, %r6; ; CHECK-NEXT: xor.b32 %r9, %r8, %r4; @@ -587,9 +587,9 @@ define <4 x i16> @in_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwind ; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [in_v4i16_param_0]; -; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [in_v4i16_param_1]; -; CHECK-NEXT: ld.param.v2.u32 {%r5, %r6}, [in_v4i16_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [in_v4i16_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [in_v4i16_param_1]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [in_v4i16_param_2]; ; CHECK-NEXT: xor.b32 %r7, %r2, %r4; ; CHECK-NEXT: and.b32 %r8, %r7, %r6; ; CHECK-NEXT: xor.b32 %r9, %r8, %r4; @@ -610,11 +610,11 @@ define <2 x i32> @in_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind ; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u32 {%r1, %r2}, [in_v2i32_param_0]; -; CHECK-NEXT: ld.param.v2.u32 {%r3, %r4}, [in_v2i32_param_1]; +; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [in_v2i32_param_0]; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [in_v2i32_param_1]; ; CHECK-NEXT: xor.b32 %r5, %r2, %r4; ; CHECK-NEXT: xor.b32 %r6, %r1, %r3; -; CHECK-NEXT: ld.param.v2.u32 {%r7, %r8}, [in_v2i32_param_2]; +; CHECK-NEXT: ld.param.v2.b32 {%r7, %r8}, [in_v2i32_param_2]; ; CHECK-NEXT: and.b32 %r9, %r6, %r7; ; CHECK-NEXT: and.b32 %r10, %r5, %r8; ; CHECK-NEXT: xor.b32 %r11, %r10, %r4; @@ -633,10 +633,10 @@ define <1 x i64> @in_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind ; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [in_v1i64_param_0]; -; CHECK-NEXT: 
ld.param.u64 %rd2, [in_v1i64_param_1]; +; CHECK-NEXT: ld.param.b64 %rd1, [in_v1i64_param_0]; +; CHECK-NEXT: ld.param.b64 %rd2, [in_v1i64_param_1]; ; CHECK-NEXT: xor.b64 %rd3, %rd1, %rd2; -; CHECK-NEXT: ld.param.u64 %rd4, [in_v1i64_param_2]; +; CHECK-NEXT: ld.param.b64 %rd4, [in_v1i64_param_2]; ; CHECK-NEXT: and.b64 %rd5, %rd3, %rd4; ; CHECK-NEXT: xor.b64 %rd6, %rd5, %rd2; ; CHECK-NEXT: st.param.b64 [func_retval0], %rd6; @@ -657,13 +657,13 @@ define <16 x i8> @in_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) nounwind ; CHECK-NEXT: .reg .b32 %r<25>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [in_v16i8_param_0]; -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [in_v16i8_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [in_v16i8_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [in_v16i8_param_1]; ; CHECK-NEXT: xor.b32 %r9, %r4, %r8; ; CHECK-NEXT: xor.b32 %r10, %r3, %r7; ; CHECK-NEXT: xor.b32 %r11, %r2, %r6; ; CHECK-NEXT: xor.b32 %r12, %r1, %r5; -; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [in_v16i8_param_2]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [in_v16i8_param_2]; ; CHECK-NEXT: and.b32 %r17, %r12, %r13; ; CHECK-NEXT: and.b32 %r18, %r11, %r14; ; CHECK-NEXT: and.b32 %r19, %r10, %r15; @@ -686,13 +686,13 @@ define <8 x i16> @in_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwind ; CHECK-NEXT: .reg .b32 %r<25>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [in_v8i16_param_0]; -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [in_v8i16_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [in_v8i16_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [in_v8i16_param_1]; ; CHECK-NEXT: xor.b32 %r9, %r4, %r8; ; CHECK-NEXT: xor.b32 %r10, %r3, %r7; ; CHECK-NEXT: xor.b32 %r11, %r2, %r6; ; CHECK-NEXT: xor.b32 %r12, %r1, %r5; -; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, 
[in_v8i16_param_2]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [in_v8i16_param_2]; ; CHECK-NEXT: and.b32 %r17, %r12, %r13; ; CHECK-NEXT: and.b32 %r18, %r11, %r14; ; CHECK-NEXT: and.b32 %r19, %r10, %r15; @@ -715,13 +715,13 @@ define <4 x i32> @in_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind ; CHECK-NEXT: .reg .b32 %r<25>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v4.u32 {%r1, %r2, %r3, %r4}, [in_v4i32_param_0]; -; CHECK-NEXT: ld.param.v4.u32 {%r5, %r6, %r7, %r8}, [in_v4i32_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [in_v4i32_param_0]; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [in_v4i32_param_1]; ; CHECK-NEXT: xor.b32 %r9, %r4, %r8; ; CHECK-NEXT: xor.b32 %r10, %r3, %r7; ; CHECK-NEXT: xor.b32 %r11, %r2, %r6; ; CHECK-NEXT: xor.b32 %r12, %r1, %r5; -; CHECK-NEXT: ld.param.v4.u32 {%r13, %r14, %r15, %r16}, [in_v4i32_param_2]; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [in_v4i32_param_2]; ; CHECK-NEXT: and.b32 %r17, %r12, %r13; ; CHECK-NEXT: and.b32 %r18, %r11, %r14; ; CHECK-NEXT: and.b32 %r19, %r10, %r15; @@ -744,11 +744,11 @@ define <2 x i64> @in_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind ; CHECK-NEXT: .reg .b64 %rd<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [in_v2i64_param_0]; -; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [in_v2i64_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [in_v2i64_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [in_v2i64_param_1]; ; CHECK-NEXT: xor.b64 %rd5, %rd2, %rd4; ; CHECK-NEXT: xor.b64 %rd6, %rd1, %rd3; -; CHECK-NEXT: ld.param.v2.u64 {%rd7, %rd8}, [in_v2i64_param_2]; +; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [in_v2i64_param_2]; ; CHECK-NEXT: and.b64 %rd9, %rd6, %rd7; ; CHECK-NEXT: and.b64 %rd10, %rd5, %rd8; ; CHECK-NEXT: xor.b64 %rd11, %rd10, %rd4; diff --git a/llvm/test/CodeGen/NVPTX/vaargs.ll b/llvm/test/CodeGen/NVPTX/vaargs.ll index 465e2a6a60eb..0cd0d29294c3 100644 
--- a/llvm/test/CodeGen/NVPTX/vaargs.ll +++ b/llvm/test/CodeGen/NVPTX/vaargs.ll @@ -17,56 +17,56 @@ entry: ; Test va_start ; CHECK: .param .align 8 .b8 foo_vararg[] ; CHECK: mov.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], foo_vararg; -; CHECK-NEXT: st.u[[BITS]] [%SP], [[VA_PTR]]; +; CHECK-NEXT: st.b[[BITS]] [%SP], [[VA_PTR]]; call void @llvm.va_start(ptr %al) ; Test va_copy() -; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP]; -; CHECK-NEXT: st.u[[BITS]] [%SP+{{[0-9]+}}], [[VA_PTR]]; +; CHECK-NEXT: ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP]; +; CHECK-NEXT: st.b[[BITS]] [%SP+{{[0-9]+}}], [[VA_PTR]]; call void @llvm.va_copy(ptr %al2, ptr %al) ; Test va_arg(ap, int32_t) -; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP]; +; CHECK-NEXT: ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP]; ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_TMP:%(r|rd)[0-9]+]], [[VA_PTR]], 3; ; CHECK-NEXT: and.b[[BITS]] [[VA_PTR_ALIGN:%(r|rd)[0-9]+]], [[VA_PTR_TMP]], -4; ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_NEXT:%(r|rd)[0-9]+]], [[VA_PTR_ALIGN]], 4; -; CHECK-NEXT: st.u[[BITS]] [%SP], [[VA_PTR_NEXT]]; -; CHECK-NEXT: ld.local.u32 %r{{[0-9]+}}, [[[VA_PTR_ALIGN]]]; +; CHECK-NEXT: st.b[[BITS]] [%SP], [[VA_PTR_NEXT]]; +; CHECK-NEXT: ld.local.b32 %r{{[0-9]+}}, [[[VA_PTR_ALIGN]]]; %0 = va_arg ptr %al, i32 ; Test va_arg(ap, int64_t) -; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP]; +; CHECK-NEXT: ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP]; ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_TMP:%(r|rd)[0-9]+]], [[VA_PTR]], 7; ; CHECK-NEXT: and.b[[BITS]] [[VA_PTR_ALIGN:%(r|rd)[0-9]+]], [[VA_PTR_TMP]], -8; ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_NEXT:%(r|rd)[0-9]+]], [[VA_PTR_ALIGN]], 8; -; CHECK-NEXT: st.u[[BITS]] [%SP], [[VA_PTR_NEXT]]; -; CHECK-NEXT: ld.local.u64 %rd{{[0-9]+}}, [[[VA_PTR_ALIGN]]]; +; CHECK-NEXT: st.b[[BITS]] [%SP], [[VA_PTR_NEXT]]; +; CHECK-NEXT: ld.local.b64 %rd{{[0-9]+}}, [[[VA_PTR_ALIGN]]]; %1 = va_arg ptr %al, i64 ; Test va_arg(ap, double) -; CHECK-NEXT: ld.u[[BITS]] 
[[VA_PTR:%(r|rd)[0-9]+]], [%SP]; +; CHECK-NEXT: ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP]; ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_TMP:%(r|rd)[0-9]+]], [[VA_PTR]], 7; ; CHECK-NEXT: and.b[[BITS]] [[VA_PTR_ALIGN:%(r|rd)[0-9]+]], [[VA_PTR_TMP]], -8; ; CHECK-NEXT: add.s[[BITS]] [[VA_PTR_NEXT:%(r|rd)[0-9]+]], [[VA_PTR_ALIGN]], 8; -; CHECK-NEXT: st.u[[BITS]] [%SP], [[VA_PTR_NEXT]]; -; CHECK-NEXT: ld.local.f64 %fd{{[0-9]+}}, [[[VA_PTR_ALIGN]]]; +; CHECK-NEXT: st.b[[BITS]] [%SP], [[VA_PTR_NEXT]]; +; CHECK-NEXT: ld.local.b64 %fd{{[0-9]+}}, [[[VA_PTR_ALIGN]]]; %2 = va_arg ptr %al, double ; Test va_arg(ap, ptr) -; CHECK-NEXT: ld.u[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP]; +; CHECK-NEXT: ld.b[[BITS]] [[VA_PTR:%(r|rd)[0-9]+]], [%SP]; ; CHECK32-NEXT: add.s32 [[VA_PTR_TMP:%r[0-9]+]], [[VA_PTR]], 3; ; CHECK64-NEXT: add.s64 [[VA_PTR_TMP:%rd[0-9]+]], [[VA_PTR]], 7; ; CHECK32-NEXT: and.b32 [[VA_PTR_ALIGN:%r[0-9]+]], [[VA_PTR_TMP]], -4; ; CHECK64-NEXT: and.b64 [[VA_PTR_ALIGN:%rd[0-9]+]], [[VA_PTR_TMP]], -8; ; CHECK32-NEXT: add.s32 [[VA_PTR_NEXT:%r[0-9]+]], [[VA_PTR_ALIGN]], 4; ; CHECK64-NEXT: add.s64 [[VA_PTR_NEXT:%rd[0-9]+]], [[VA_PTR_ALIGN]], 8; -; CHECK-NEXT: st.u[[BITS]] [%SP], [[VA_PTR_NEXT]]; -; CHECK-NEXT: ld.local.u[[BITS]] %{{(r|rd)[0-9]+}}, [[[VA_PTR_ALIGN]]]; +; CHECK-NEXT: st.b[[BITS]] [%SP], [[VA_PTR_NEXT]]; +; CHECK-NEXT: ld.local.b[[BITS]] %{{(r|rd)[0-9]+}}, [[[VA_PTR_ALIGN]]]; %3 = va_arg ptr %al, ptr %call = call i32 @bar(i32 %a, i32 %0, i64 %1, double %2, ptr %3) @@ -82,18 +82,18 @@ define i32 @test_foo(i32 %i, i64 %l, double %d, ptr %p) { ; Test indirect variadic function call. 
; Load arguments to temporary variables -; CHECK32: ld.param.u32 [[ARG_VOID_PTR:%r[0-9]+]], [test_foo_param_3]; -; CHECK64: ld.param.u64 [[ARG_VOID_PTR:%rd[0-9]+]], [test_foo_param_3]; -; CHECK-NEXT: ld.param.f64 [[ARG_DOUBLE:%fd[0-9]+]], [test_foo_param_2]; -; CHECK-NEXT: ld.param.u64 [[ARG_I64:%rd[0-9]+]], [test_foo_param_1]; -; CHECK-NEXT: ld.param.u32 [[ARG_I32:%r[0-9]+]], [test_foo_param_0]; +; CHECK32: ld.param.b32 [[ARG_VOID_PTR:%r[0-9]+]], [test_foo_param_3]; +; CHECK64: ld.param.b64 [[ARG_VOID_PTR:%rd[0-9]+]], [test_foo_param_3]; +; CHECK-NEXT: ld.param.b64 [[ARG_DOUBLE:%fd[0-9]+]], [test_foo_param_2]; +; CHECK-NEXT: ld.param.b64 [[ARG_I64:%rd[0-9]+]], [test_foo_param_1]; +; CHECK-NEXT: ld.param.b32 [[ARG_I32:%r[0-9]+]], [test_foo_param_0]; ; Store arguments to an array ; CHECK32: .param .align 8 .b8 param1[28]; ; CHECK64: .param .align 8 .b8 param1[32]; ; CHECK-NEXT: st.param.b32 [param1], [[ARG_I32]]; ; CHECK-NEXT: st.param.b64 [param1+8], [[ARG_I64]]; -; CHECK-NEXT: st.param.f64 [param1+16], [[ARG_DOUBLE]]; +; CHECK-NEXT: st.param.b64 [param1+16], [[ARG_DOUBLE]]; ; CHECK-NEXT: st.param.b[[BITS]] [param1+24], [[ARG_VOID_PTR]]; ; CHECK-NEXT: .param .b32 retval0; ; CHECK-NEXT: prototype_1 : .callprototype (.param .b32 _) _ (.param .b32 _, .param .align 8 .b8 _[] diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll index 9da361455a65..3235587f3d56 100644 --- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll +++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll @@ -16,29 +16,29 @@ define dso_local i32 @variadics1(i32 noundef %first, ...) 
{ ; CHECK-PTX-NEXT: .reg .b64 %fd<7>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: // %entry -; CHECK-PTX-NEXT: ld.param.u32 %r1, [variadics1_param_0]; -; CHECK-PTX-NEXT: ld.param.u64 %rd1, [variadics1_param_1]; -; CHECK-PTX-NEXT: ld.u32 %r2, [%rd1]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [variadics1_param_0]; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [variadics1_param_1]; +; CHECK-PTX-NEXT: ld.b32 %r2, [%rd1]; ; CHECK-PTX-NEXT: add.s32 %r3, %r1, %r2; -; CHECK-PTX-NEXT: ld.u32 %r4, [%rd1+4]; +; CHECK-PTX-NEXT: ld.b32 %r4, [%rd1+4]; ; CHECK-PTX-NEXT: add.s32 %r5, %r3, %r4; -; CHECK-PTX-NEXT: ld.u32 %r6, [%rd1+8]; +; CHECK-PTX-NEXT: ld.b32 %r6, [%rd1+8]; ; CHECK-PTX-NEXT: add.s32 %r7, %r5, %r6; ; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 19; ; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8; -; CHECK-PTX-NEXT: ld.u64 %rd4, [%rd3]; +; CHECK-PTX-NEXT: ld.b64 %rd4, [%rd3]; ; CHECK-PTX-NEXT: cvt.u64.u32 %rd5, %r7; ; CHECK-PTX-NEXT: add.s64 %rd6, %rd5, %rd4; ; CHECK-PTX-NEXT: cvt.u32.u64 %r8, %rd6; ; CHECK-PTX-NEXT: add.s64 %rd7, %rd3, 15; ; CHECK-PTX-NEXT: and.b64 %rd8, %rd7, -8; -; CHECK-PTX-NEXT: ld.f64 %fd1, [%rd8]; +; CHECK-PTX-NEXT: ld.b64 %fd1, [%rd8]; ; CHECK-PTX-NEXT: cvt.rn.f64.s32 %fd2, %r8; ; CHECK-PTX-NEXT: add.rn.f64 %fd3, %fd2, %fd1; ; CHECK-PTX-NEXT: cvt.rzi.s32.f64 %r9, %fd3; ; CHECK-PTX-NEXT: add.s64 %rd9, %rd8, 15; ; CHECK-PTX-NEXT: and.b64 %rd10, %rd9, -8; -; CHECK-PTX-NEXT: ld.f64 %fd4, [%rd10]; +; CHECK-PTX-NEXT: ld.b64 %fd4, [%rd10]; ; CHECK-PTX-NEXT: cvt.rn.f64.s32 %fd5, %r9; ; CHECK-PTX-NEXT: add.rn.f64 %fd6, %fd5, %fd4; ; CHECK-PTX-NEXT: cvt.rzi.s32.f64 %r10, %fd6; @@ -112,14 +112,14 @@ define dso_local i32 @foo() { ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot1; ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; ; CHECK-PTX-NEXT: mov.b64 %rd1, 4294967297; -; CHECK-PTX-NEXT: st.u64 [%SP], %rd1; +; CHECK-PTX-NEXT: st.b64 [%SP], %rd1; ; CHECK-PTX-NEXT: mov.b32 %r1, 1; -; CHECK-PTX-NEXT: st.u32 [%SP+8], %r1; +; CHECK-PTX-NEXT: st.b32 [%SP+8], %r1; ; CHECK-PTX-NEXT: mov.b64 
%rd2, 1; -; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd2; +; CHECK-PTX-NEXT: st.b64 [%SP+16], %rd2; ; CHECK-PTX-NEXT: mov.b64 %rd3, 4607182418800017408; -; CHECK-PTX-NEXT: st.u64 [%SP+24], %rd3; -; CHECK-PTX-NEXT: st.u64 [%SP+32], %rd3; +; CHECK-PTX-NEXT: st.b64 [%SP+24], %rd3; +; CHECK-PTX-NEXT: st.b64 [%SP+32], %rd3; ; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 0; ; CHECK-PTX-NEXT: { // callseq 0, 0 ; CHECK-PTX-NEXT: .param .b32 param0; @@ -157,20 +157,20 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) { ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: // %entry ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot2; -; CHECK-PTX-NEXT: ld.param.u32 %r1, [variadics2_param_0]; -; CHECK-PTX-NEXT: ld.param.u64 %rd1, [variadics2_param_1]; +; CHECK-PTX-NEXT: ld.param.b32 %r1, [variadics2_param_0]; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [variadics2_param_1]; ; CHECK-PTX-NEXT: add.u64 %rd3, %SPL, 0; ; CHECK-PTX-NEXT: add.s64 %rd4, %rd1, 7; ; CHECK-PTX-NEXT: and.b64 %rd5, %rd4, -8; -; CHECK-PTX-NEXT: ld.u32 %r2, [%rd5]; +; CHECK-PTX-NEXT: ld.b32 %r2, [%rd5]; ; CHECK-PTX-NEXT: ld.s8 %r3, [%rd5+4]; -; CHECK-PTX-NEXT: ld.u8 %rs1, [%rd5+7]; -; CHECK-PTX-NEXT: st.local.u8 [%rd3+2], %rs1; -; CHECK-PTX-NEXT: ld.u8 %rs2, [%rd5+6]; -; CHECK-PTX-NEXT: st.local.u8 [%rd3+1], %rs2; -; CHECK-PTX-NEXT: ld.u8 %rs3, [%rd5+5]; -; CHECK-PTX-NEXT: st.local.u8 [%rd3], %rs3; -; CHECK-PTX-NEXT: ld.u64 %rd6, [%rd5+8]; +; CHECK-PTX-NEXT: ld.b8 %rs1, [%rd5+7]; +; CHECK-PTX-NEXT: st.local.b8 [%rd3+2], %rs1; +; CHECK-PTX-NEXT: ld.b8 %rs2, [%rd5+6]; +; CHECK-PTX-NEXT: st.local.b8 [%rd3+1], %rs2; +; CHECK-PTX-NEXT: ld.b8 %rs3, [%rd5+5]; +; CHECK-PTX-NEXT: st.local.b8 [%rd3], %rs3; +; CHECK-PTX-NEXT: ld.b64 %rd6, [%rd5+8]; ; CHECK-PTX-NEXT: add.s32 %r4, %r1, %r2; ; CHECK-PTX-NEXT: add.s32 %r5, %r4, %r3; ; CHECK-PTX-NEXT: cvt.u64.u32 %rd7, %r5; @@ -220,21 +220,21 @@ define dso_local i32 @bar() { ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot3; ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; ; CHECK-PTX-NEXT: add.u64 
%rd2, %SPL, 0; -; CHECK-PTX-NEXT: ld.global.nc.u8 %rs1, [__const_$_bar_$_s1+7]; +; CHECK-PTX-NEXT: ld.global.nc.b8 %rs1, [__const_$_bar_$_s1+7]; ; CHECK-PTX-NEXT: cvt.u16.u8 %rs2, %rs1; -; CHECK-PTX-NEXT: st.local.u8 [%rd2+2], %rs2; -; CHECK-PTX-NEXT: ld.global.nc.u8 %rs3, [__const_$_bar_$_s1+6]; +; CHECK-PTX-NEXT: st.local.b8 [%rd2+2], %rs2; +; CHECK-PTX-NEXT: ld.global.nc.b8 %rs3, [__const_$_bar_$_s1+6]; ; CHECK-PTX-NEXT: cvt.u16.u8 %rs4, %rs3; -; CHECK-PTX-NEXT: st.local.u8 [%rd2+1], %rs4; -; CHECK-PTX-NEXT: ld.global.nc.u8 %rs5, [__const_$_bar_$_s1+5]; +; CHECK-PTX-NEXT: st.local.b8 [%rd2+1], %rs4; +; CHECK-PTX-NEXT: ld.global.nc.b8 %rs5, [__const_$_bar_$_s1+5]; ; CHECK-PTX-NEXT: cvt.u16.u8 %rs6, %rs5; -; CHECK-PTX-NEXT: st.local.u8 [%rd2], %rs6; +; CHECK-PTX-NEXT: st.local.b8 [%rd2], %rs6; ; CHECK-PTX-NEXT: mov.b32 %r1, 1; -; CHECK-PTX-NEXT: st.u32 [%SP+8], %r1; +; CHECK-PTX-NEXT: st.b32 [%SP+8], %r1; ; CHECK-PTX-NEXT: mov.b16 %rs7, 1; -; CHECK-PTX-NEXT: st.u8 [%SP+12], %rs7; +; CHECK-PTX-NEXT: st.b8 [%SP+12], %rs7; ; CHECK-PTX-NEXT: mov.b64 %rd3, 1; -; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd3; +; CHECK-PTX-NEXT: st.b64 [%SP+16], %rd3; ; CHECK-PTX-NEXT: add.u64 %rd4, %SP, 8; ; CHECK-PTX-NEXT: { // callseq 1, 0 ; CHECK-PTX-NEXT: .param .b32 param0; @@ -269,10 +269,10 @@ define dso_local i32 @variadics3(i32 noundef %first, ...) 
{ ; CHECK-PTX-NEXT: .reg .b64 %rd<4>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: // %entry -; CHECK-PTX-NEXT: ld.param.u64 %rd1, [variadics3_param_1]; +; CHECK-PTX-NEXT: ld.param.b64 %rd1, [variadics3_param_1]; ; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 15; ; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -16; -; CHECK-PTX-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd3]; +; CHECK-PTX-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd3]; ; CHECK-PTX-NEXT: add.s32 %r5, %r1, %r2; ; CHECK-PTX-NEXT: add.s32 %r6, %r5, %r3; ; CHECK-PTX-NEXT: add.s32 %r7, %r6, %r4; @@ -311,7 +311,7 @@ define dso_local i32 @baz() { ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot5; ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; ; CHECK-PTX-NEXT: mov.b32 %r1, 1; -; CHECK-PTX-NEXT: st.v4.u32 [%SP], {%r1, %r1, %r1, %r1}; +; CHECK-PTX-NEXT: st.v4.b32 [%SP], {%r1, %r1, %r1, %r1}; ; CHECK-PTX-NEXT: add.u64 %rd1, %SP, 0; ; CHECK-PTX-NEXT: { // callseq 2, 0 ; CHECK-PTX-NEXT: .param .b32 param0; @@ -341,12 +341,12 @@ define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, . 
; CHECK-PTX-NEXT: .reg .b64 %rd<10>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: // %entry -; CHECK-PTX-NEXT: ld.param.u64 %rd2, [variadics4_param_1]; +; CHECK-PTX-NEXT: ld.param.b64 %rd2, [variadics4_param_1]; ; CHECK-PTX-NEXT: add.s64 %rd3, %rd2, 7; ; CHECK-PTX-NEXT: and.b64 %rd4, %rd3, -8; -; CHECK-PTX-NEXT: ld.u64 %rd5, [%rd4]; -; CHECK-PTX-NEXT: ld.param.u64 %rd6, [variadics4_param_0]; -; CHECK-PTX-NEXT: ld.param.u64 %rd7, [variadics4_param_0+8]; +; CHECK-PTX-NEXT: ld.b64 %rd5, [%rd4]; +; CHECK-PTX-NEXT: ld.param.b64 %rd6, [variadics4_param_0]; +; CHECK-PTX-NEXT: ld.param.b64 %rd7, [variadics4_param_0+8]; ; CHECK-PTX-NEXT: add.s64 %rd8, %rd6, %rd7; ; CHECK-PTX-NEXT: add.s64 %rd9, %rd8, %rd5; ; CHECK-PTX-NEXT: cvt.u32.u64 %r1, %rd9; @@ -385,14 +385,14 @@ define dso_local void @qux() { ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot7; ; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL; ; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0; -; CHECK-PTX-NEXT: ld.global.nc.u64 %rd3, [__const_$_qux_$_s+8]; -; CHECK-PTX-NEXT: st.local.u64 [%rd2+8], %rd3; -; CHECK-PTX-NEXT: ld.global.nc.u64 %rd4, [__const_$_qux_$_s]; -; CHECK-PTX-NEXT: st.local.u64 [%rd2], %rd4; +; CHECK-PTX-NEXT: ld.global.nc.b64 %rd3, [__const_$_qux_$_s+8]; +; CHECK-PTX-NEXT: st.local.b64 [%rd2+8], %rd3; +; CHECK-PTX-NEXT: ld.global.nc.b64 %rd4, [__const_$_qux_$_s]; +; CHECK-PTX-NEXT: st.local.b64 [%rd2], %rd4; ; CHECK-PTX-NEXT: mov.b64 %rd5, 1; -; CHECK-PTX-NEXT: st.u64 [%SP+16], %rd5; -; CHECK-PTX-NEXT: ld.local.u64 %rd6, [%rd2]; -; CHECK-PTX-NEXT: ld.local.u64 %rd7, [%rd2+8]; +; CHECK-PTX-NEXT: st.b64 [%SP+16], %rd5; +; CHECK-PTX-NEXT: ld.local.b64 %rd6, [%rd2]; +; CHECK-PTX-NEXT: ld.local.b64 %rd7, [%rd2+8]; ; CHECK-PTX-NEXT: add.u64 %rd8, %SP, 16; ; CHECK-PTX-NEXT: { // callseq 3, 0 ; CHECK-PTX-NEXT: .param .align 8 .b8 param0[16]; diff --git a/llvm/test/CodeGen/NVPTX/vec-param-load.ll b/llvm/test/CodeGen/NVPTX/vec-param-load.ll index 5dea424c7dcc..8710d58ce6e9 100644 --- 
a/llvm/test/CodeGen/NVPTX/vec-param-load.ll +++ b/llvm/test/CodeGen/NVPTX/vec-param-load.ll @@ -5,40 +5,40 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 define <16 x float> @test_v16f32(<16 x float> %a) { ; CHECK-LABEL: test_v16f32( -; CHECK-DAG: ld.param.v4.f32 {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48]; -; CHECK-DAG: ld.param.v4.f32 {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32]; -; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16]; -; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0]; -; CHECK-DAG: st.param.v4.f32 [func_retval0], {[[V_0_3]]} -; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]} -; CHECK-DAG: st.param.v4.f32 [func_retval0+32], {[[V_8_11]]} -; CHECK-DAG: st.param.v4.f32 [func_retval0+48], {[[V_12_15]]} +; CHECK-DAG: ld.param.v4.b32 {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48]; +; CHECK-DAG: ld.param.v4.b32 {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32]; +; CHECK-DAG: ld.param.v4.b32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16]; +; CHECK-DAG: ld.param.v4.b32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0]; +; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_4_7]]} +; CHECK-DAG: st.param.v4.b32 [func_retval0+32], {[[V_8_11]]} +; CHECK-DAG: st.param.v4.b32 [func_retval0+48], {[[V_12_15]]} ; CHECK: ret; ret <16 x float> %a } define <8 x float> @test_v8f32(<8 x float> %a) { ; CHECK-LABEL: test_v8f32( -; CHECK-DAG: ld.param.v4.f32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16]; -; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0]; -; CHECK-DAG: st.param.v4.f32 [func_retval0], {[[V_0_3]]} -; CHECK-DAG: st.param.v4.f32 [func_retval0+16], {[[V_4_7]]} +; CHECK-DAG: ld.param.v4.b32 {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16]; +; CHECK-DAG: ld.param.v4.b32 
{[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0]; +; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_3]]} +; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_4_7]]} ; CHECK: ret; ret <8 x float> %a } define <4 x float> @test_v4f32(<4 x float> %a) { ; CHECK-LABEL: test_v4f32( -; CHECK-DAG: ld.param.v4.f32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0]; -; CHECK-DAG: st.param.v4.f32 [func_retval0], {[[V_0_3]]} +; CHECK-DAG: ld.param.v4.b32 {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0]; +; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_3]]} ; CHECK: ret; ret <4 x float> %a } define <2 x float> @test_v2f32(<2 x float> %a) { ; CHECK-LABEL: test_v2f32( -; CHECK-DAG: ld.param.v2.f32 {[[V_0_3:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0]; -; CHECK-DAG: st.param.v2.f32 [func_retval0], {[[V_0_3]]} +; CHECK-DAG: ld.param.v2.b32 {[[V_0_3:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0]; +; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[V_0_3]]} ; CHECK: ret; ret <2 x float> %a } @@ -46,20 +46,20 @@ define <2 x float> @test_v2f32(<2 x float> %a) { ; Oddly shaped vectors should not load any extra elements. 
define <3 x float> @test_v3f32(<3 x float> %a) { ; CHECK-LABEL: test_v3f32( -; CHECK-DAG: ld.param.f32 [[V_2:%f[0-9]+]], [test_v3f32_param_0+8]; -; CHECK-DAG: ld.param.v2.f32 {[[V_0_1:(%f[0-9]+[, ]*){2}]]}, [test_v3f32_param_0]; -; CHECK-DAG: st.param.v2.f32 [func_retval0], {[[V_0_1]]} -; CHECK-DAG: st.param.f32 [func_retval0+8], [[V_2]] +; CHECK-DAG: ld.param.b32 [[V_2:%f[0-9]+]], [test_v3f32_param_0+8]; +; CHECK-DAG: ld.param.v2.b32 {[[V_0_1:(%f[0-9]+[, ]*){2}]]}, [test_v3f32_param_0]; +; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[V_0_1]]} +; CHECK-DAG: st.param.b32 [func_retval0+8], [[V_2]] ; CHECK: ret; ret <3 x float> %a } define <8 x i64> @test_v8i64(<8 x i64> %a) { ; CHECK-LABEL: test_v8i64( -; CHECK-DAG: ld.param.v2.u64 {[[V_6_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+48]; -; CHECK-DAG: ld.param.v2.u64 {[[V_4_5:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+32]; -; CHECK-DAG: ld.param.v2.u64 {[[V_2_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+16]; -; CHECK-DAG: ld.param.v2.u64 {[[V_0_1:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0]; +; CHECK-DAG: ld.param.v2.b64 {[[V_6_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+48]; +; CHECK-DAG: ld.param.v2.b64 {[[V_4_5:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+32]; +; CHECK-DAG: ld.param.v2.b64 {[[V_2_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+16]; +; CHECK-DAG: ld.param.v2.b64 {[[V_0_1:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0]; ; CHECK-DAG: st.param.v2.b64 [func_retval0], {[[V_0_1]]} ; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[V_2_3]]} ; CHECK-DAG: st.param.v2.b64 [func_retval0+32], {[[V_4_5]]} @@ -70,8 +70,8 @@ define <8 x i64> @test_v8i64(<8 x i64> %a) { define <16 x i16> @test_v16i16(<16 x i16> %a) { ; CHECK-LABEL: test_v16i16( -; CHECK-DAG: ld.param.v4.u32 {[[V_8_15:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16]; -; CHECK-DAG: ld.param.v4.u32 {[[V_0_7:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0]; +; CHECK-DAG: ld.param.v4.b32 {[[V_8_15:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16]; +; 
CHECK-DAG: ld.param.v4.b32 {[[V_0_7:(%r[0-9]+[, ]*){4}]]}, [test_v16i16_param_0]; ; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[V_0_7]]} ; CHECK-DAG: st.param.v4.b32 [func_retval0+16], {[[V_8_15]]} ; CHECK: ret; diff --git a/llvm/test/CodeGen/NVPTX/vec8.ll b/llvm/test/CodeGen/NVPTX/vec8.ll index 3a3dd8072abf..b44c084bd7b8 100644 --- a/llvm/test/CodeGen/NVPTX/vec8.ll +++ b/llvm/test/CodeGen/NVPTX/vec8.ll @@ -5,10 +5,10 @@ target triple = "nvptx-unknown-cuda" ; CHECK: .visible .func foo define void @foo(<8 x i8> %a, ptr %b) { -; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo_param_0] -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9+]]], [foo_param_1] +; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo_param_0] +; CHECK-DAG: ld.param.b64 %[[B:rd[0-9+]]], [foo_param_1] ; CHECK: add.s16 [[T:%rs[0-9+]]], -; CHECK: st.u8 [%[[B]]], [[T]]; +; CHECK: st.b8 [%[[B]]], [[T]]; %t0 = extractelement <8 x i8> %a, i32 1 %t1 = extractelement <8 x i8> %a, i32 6 %t = add i8 %t0, %t1 diff --git a/llvm/test/CodeGen/NVPTX/vector-args.ll b/llvm/test/CodeGen/NVPTX/vector-args.ll index bc1a138e25bd..192cd562d67b 100644 --- a/llvm/test/CodeGen/NVPTX/vector-args.ll +++ b/llvm/test/CodeGen/NVPTX/vector-args.ll @@ -4,7 +4,7 @@ define float @foo(<2 x float> %a) { ; CHECK: .func (.param .b32 func_retval0) foo ; CHECK: .param .align 8 .b8 foo_param_0[8] -; CHECK: ld.param.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}} +; CHECK: ld.param.v2.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}} %t1 = fmul <2 x float> %a, %a %t2 = extractelement <2 x float> %t1, i32 0 %t3 = extractelement <2 x float> %t1, i32 1 @@ -16,7 +16,7 @@ define float @foo(<2 x float> %a) { define float @bar(<4 x float> %a) { ; CHECK: .func (.param .b32 func_retval0) bar ; CHECK: .param .align 16 .b8 bar_param_0[16] -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} +; CHECK: ld.param.v4.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} %t1 = fmul <4 x float> %a, %a %t2 = 
extractelement <4 x float> %t1, i32 0 %t3 = extractelement <4 x float> %t1, i32 1 @@ -28,8 +28,8 @@ define float @bar(<4 x float> %a) { define <4 x float> @baz(<4 x float> %a) { ; CHECK: .func (.param .align 16 .b8 func_retval0[16]) baz ; CHECK: .param .align 16 .b8 baz_param_0[16] -; CHECK: ld.param.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} -; CHECK: st.param.v4.f32 [func_retval0], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} +; CHECK: ld.param.v4.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} +; CHECK: st.param.v4.b32 [func_retval0], {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} %t1 = fmul <4 x float> %a, %a ret <4 x float> %t1 } diff --git a/llvm/test/CodeGen/NVPTX/vector-call.ll b/llvm/test/CodeGen/NVPTX/vector-call.ll index 83439e7744fa..27063f833b7d 100644 --- a/llvm/test/CodeGen/NVPTX/vector-call.ll +++ b/llvm/test/CodeGen/NVPTX/vector-call.ll @@ -6,7 +6,7 @@ target triple = "nvptx-unknown-cuda" declare void @bar(<4 x i32>) ; CHECK-LABEL: .func foo( -; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0]; +; CHECK-DAG: ld.param.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0]; ; CHECK: .param .align 16 .b8 param0[16]; ; CHECK-DAG: st.param.v4.b32 [param0], {[[E0]], [[E1]], [[E2]], [[E3]]}; ; CHECK: call.uni @@ -17,8 +17,8 @@ define void @foo(<4 x i32> %a) { } ; CHECK-LABEL: .func foo3( -; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0]; -; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8]; +; CHECK-DAG: ld.param.v2.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0]; +; CHECK-DAG: ld.param.b32 [[E2:%r[0-9]+]], [foo3_param_0+8]; ; CHECK: .param .align 16 .b8 param0[16]; ; CHECK-DAG: st.param.v2.b32 [param0], {[[E0]], [[E1]]}; ; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; diff --git a/llvm/test/CodeGen/NVPTX/vector-compare.ll 
b/llvm/test/CodeGen/NVPTX/vector-compare.ll index 4a1335f13b22..0e63ee96932d 100644 --- a/llvm/test/CodeGen/NVPTX/vector-compare.ll +++ b/llvm/test/CodeGen/NVPTX/vector-compare.ll @@ -9,9 +9,9 @@ ; CHECK-LABEL: .visible .func foo( define void @foo(ptr %a, ptr %b, ptr %r1, ptr %r2) { -; CHECK: ld.v2.u32 +; CHECK: ld.v2.b32 %aval = load <2 x i32>, ptr %a -; CHECK: ld.v2.u32 +; CHECK: ld.v2.b32 %bval = load <2 x i32>, ptr %b ; CHECK: setp.lt.s32 ; CHECK: setp.lt.s32 @@ -22,8 +22,8 @@ define void @foo(ptr %a, ptr %b, ptr %r1, ptr %r2) { ; CHECK: selp.b32 %r{{[0-9]+}}, 1, 0 %t1a = zext i1 %t1 to i32 %t2a = zext i1 %t2 to i32 -; CHECK: st.u32 -; CHECK: st.u32 +; CHECK: st.b32 +; CHECK: st.b32 store i32 %t1a, ptr %r1 store i32 %t2a, ptr %r2 ret void diff --git a/llvm/test/CodeGen/NVPTX/vector-loads.ll b/llvm/test/CodeGen/NVPTX/vector-loads.ll index d731985ae971..825a66ec04b5 100644 --- a/llvm/test/CodeGen/NVPTX/vector-loads.ll +++ b/llvm/test/CodeGen/NVPTX/vector-loads.ll @@ -10,7 +10,7 @@ ; CHECK-LABEL: foo define void @foo(ptr %a) { -; CHECK: ld.v2.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}} +; CHECK: ld.v2.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}} %t1 = load <2 x float>, ptr %a %t2 = fmul <2 x float> %t1, %t1 store <2 x float> %t2, ptr %a @@ -19,7 +19,7 @@ define void @foo(ptr %a) { ; CHECK-LABEL: foo2 define void @foo2(ptr %a) { -; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} +; CHECK: ld.v4.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} %t1 = load <4 x float>, ptr %a %t2 = fmul <4 x float> %t1, %t1 store <4 x float> %t2, ptr %a @@ -28,8 +28,8 @@ define void @foo2(ptr %a) { ; CHECK-LABEL: foo3 define void @foo3(ptr %a) { -; CHECK: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} -; CHECK-NEXT: ld.v4.f32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} +; CHECK: ld.v4.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} +; CHECK-NEXT: ld.v4.b32 {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}} %t1 = load 
<8 x float>, ptr %a %t2 = fmul <8 x float> %t1, %t1 store <8 x float> %t2, ptr %a @@ -40,7 +40,7 @@ define void @foo3(ptr %a) { ; CHECK-LABEL: foo4 define void @foo4(ptr %a) { -; CHECK: ld.v2.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}} +; CHECK: ld.v2.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}} %t1 = load <2 x i32>, ptr %a %t2 = mul <2 x i32> %t1, %t1 store <2 x i32> %t2, ptr %a @@ -49,7 +49,7 @@ define void @foo4(ptr %a) { ; CHECK-LABEL: foo5 define void @foo5(ptr %a) { -; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} +; CHECK: ld.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} %t1 = load <4 x i32>, ptr %a %t2 = mul <4 x i32> %t1, %t1 store <4 x i32> %t2, ptr %a @@ -58,8 +58,8 @@ define void @foo5(ptr %a) { ; CHECK-LABEL: foo6 define void @foo6(ptr %a) { -; CHECK: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} -; CHECK-NEXT: ld.v4.u32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} +; CHECK: ld.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} +; CHECK-NEXT: ld.v4.b32 {%r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}, %r{{[0-9]+}}} %t1 = load <8 x i32>, ptr %a %t2 = mul <8 x i32> %t1, %t1 store <8 x i32> %t2, ptr %a @@ -86,7 +86,7 @@ define void @foo_complex(ptr nocapture readonly align 16 dereferenceable(1342177 %t11 = zext i32 %t10 to i64 %t20 = zext i32 %t2 to i64 %t27 = getelementptr inbounds [1024 x [131072 x i8]], ptr %alloc0, i64 0, i64 %t20, i64 %t9 -; CHECK: ld.v2.u8 +; CHECK: ld.v2.b8 %t28 = load i8, ptr %t27, align 2 %t31 = getelementptr inbounds [1024 x [131072 x i8]], ptr %alloc0, i64 0, i64 %t20, i64 %t11 %t32 = load i8, ptr %t31, align 1 @@ -114,8 +114,8 @@ define void @extv8f16_global_a16(ptr addrspace(1) noalias readonly align 16 %dst ; CHECK: cvt.f32.f16 %f{{.*}}, %rs ; CHECK: cvt.f32.f16 %f{{.*}}, %rs %ext = fpext <8 x half> %v to <8 x float> -; CHECK: st.global.v4.f32 -; CHECK: st.global.v4.f32 +; CHECK: st.global.v4.b32 +; CHECK: st.global.v4.b32 store <8 x float> %ext, ptr addrspace(1) 
%dst, align 16 ret void } @@ -140,8 +140,8 @@ define void @extv8f16_global_a4(ptr addrspace(1) noalias readonly align 16 %dst, ; CHECK: cvt.f32.f16 %f{{.*}}, %rs ; CHECK: cvt.f32.f16 %f{{.*}}, %rs %ext = fpext <8 x half> %v to <8 x float> -; CHECK: st.global.v4.f32 -; CHECK: st.global.v4.f32 +; CHECK: st.global.v4.b32 +; CHECK: st.global.v4.b32 store <8 x float> %ext, ptr addrspace(1) %dst, align 16 ret void } @@ -164,8 +164,8 @@ define void @extv8f16_generic_a16(ptr noalias readonly align 16 %dst, ptr noalia ; CHECK: cvt.f32.f16 %f{{.*}}, %rs ; CHECK: cvt.f32.f16 %f{{.*}}, %rs %ext = fpext <8 x half> %v to <8 x float> -; CHECK: st.v4.f32 -; CHECK: st.v4.f32 +; CHECK: st.v4.b32 +; CHECK: st.v4.b32 store <8 x float> %ext, ptr %dst, align 16 ret void } @@ -190,8 +190,8 @@ define void @extv8f16_generic_a4(ptr noalias readonly align 16 %dst, ptr noalias ; CHECK: cvt.f32.f16 %f{{.*}}, %rs ; CHECK: cvt.f32.f16 %f{{.*}}, %rs %ext = fpext <8 x half> %v to <8 x float> -; CHECK: st.v4.f32 -; CHECK: st.v4.f32 +; CHECK: st.v4.b32 +; CHECK: st.v4.b32 store <8 x float> %ext, ptr %dst, align 16 ret void } diff --git a/llvm/test/CodeGen/NVPTX/vector-select.ll b/llvm/test/CodeGen/NVPTX/vector-select.ll index 90d7e24c7ea7..569da5e6628b 100644 --- a/llvm/test/CodeGen/NVPTX/vector-select.ll +++ b/llvm/test/CodeGen/NVPTX/vector-select.ll @@ -9,9 +9,9 @@ ; CHECK-LABEL: .visible .func foo( define void @foo(ptr addrspace(1) %def_a, ptr addrspace(1) %def_b, ptr addrspace(1) %def_c) { entry: -; CHECK: ld.global.v2.u32 -; CHECK: ld.global.v2.u32 -; CHECK: ld.global.v2.u32 +; CHECK: ld.global.v2.b32 +; CHECK: ld.global.v2.b32 +; CHECK: ld.global.v2.b32 %tmp4 = load <2 x i32>, ptr addrspace(1) %def_a %tmp6 = load <2 x i32>, ptr addrspace(1) %def_c %tmp8 = load <2 x i32>, ptr addrspace(1) %def_b @@ -21,7 +21,7 @@ entry: ; CHECK: selp.b32 ; CHECK: selp.b32 %cond = select <2 x i1> %0, <2 x i32> %tmp6, <2 x i32> %tmp8 -; CHECK: st.global.v2.u32 +; CHECK: st.global.v2.b32 store <2 x i32> %cond, ptr 
addrspace(1) %def_c ret void } diff --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll index cbcaf5fc3822..f3b101507008 100644 --- a/llvm/test/CodeGen/NVPTX/vector-stores.ll +++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll @@ -2,28 +2,28 @@ ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} ; CHECK-LABEL: .visible .func foo1 -; CHECK: st.v2.f32 +; CHECK: st.v2.b32 define void @foo1(<2 x float> %val, ptr %ptr) { store <2 x float> %val, ptr %ptr ret void } ; CHECK-LABEL: .visible .func foo2 -; CHECK: st.v4.f32 +; CHECK: st.v4.b32 define void @foo2(<4 x float> %val, ptr %ptr) { store <4 x float> %val, ptr %ptr ret void } ; CHECK-LABEL: .visible .func foo3 -; CHECK: st.v2.u32 +; CHECK: st.v2.b32 define void @foo3(<2 x i32> %val, ptr %ptr) { store <2 x i32> %val, ptr %ptr ret void } ; CHECK-LABEL: .visible .func foo4 -; CHECK: st.v4.u32 +; CHECK: st.v4.b32 define void @foo4(<4 x i32> %val, ptr %ptr) { store <4 x i32> %val, ptr %ptr ret void diff --git a/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll b/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll index ec9c38258c57..b77f69fd717f 100644 --- a/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll +++ b/llvm/test/CodeGen/NVPTX/vectorize-misaligned.ll @@ -4,10 +4,10 @@ target triple = "nvptx64-nvidia-cuda" ; CHECK-LABEL: test1 -; CHECK: ld.global.v2.f32 -; CHECK: ld.global.v2.f32 -; CHECK: st.global.v2.f32 -; CHECK: st.global.v2.f32 +; CHECK: ld.global.v2.b32 +; CHECK: ld.global.v2.b32 +; CHECK: st.global.v2.b32 +; CHECK: st.global.v2.b32 define void @test1(ptr addrspace(1) noalias align 8 %in, ptr addrspace(1) noalias align 8 %out) { %in.1 = getelementptr float, ptr addrspace(1) %in, i32 1 %in.2 = getelementptr float, ptr addrspace(1) %in, i32 2 diff --git a/llvm/test/DebugInfo/NVPTX/debug-info.ll b/llvm/test/DebugInfo/NVPTX/debug-info.ll index 1fc945b364c9..2d8a17f7178a 100644 --- a/llvm/test/DebugInfo/NVPTX/debug-info.ll +++ 
b/llvm/test/DebugInfo/NVPTX/debug-info.ll @@ -24,10 +24,10 @@ ; CHECK-DAG: .reg .b32 %r<6>; ; CHECK-DAG: .reg .b64 %rd<8>; ; CHECK: .loc [[DEBUG_INFO_CU:[0-9]+]] 5 0 -; CHECK: ld.param.u32 %r{{.+}}, [{{.+}}]; -; CHECK: ld.param.u64 %rd{{.+}}, [{{.+}}]; +; CHECK: ld.param.b32 %r{{.+}}, [{{.+}}]; +; CHECK: ld.param.b64 %rd{{.+}}, [{{.+}}]; ; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; -; CHECK: ld.param.u64 %rd{{.+}}, [{{.+}}]; +; CHECK: ld.param.b64 %rd{{.+}}, [{{.+}}]; ; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; ; CHECK: .loc [[BUILTUIN_VARS_H:[0-9]+]] 78 180 ; CHECK: mov.u32 %r{{.+}}, %ctaid.x; @@ -41,18 +41,18 @@ ; CHECK: setp.ge.s32 %p{{.+}}, %r{{.+}}, %r{{.+}}; ; CHECK: .loc [[DEBUG_INFO_CU]] 7 7 ; CHECK: @%p{{.+}} bra [[BB:\$L__.+]]; -; CHECK: ld.param.f32 %f{{.+}}, [{{.+}}]; +; CHECK: ld.param.b32 %f{{.+}}, [{{.+}}]; ; CHECK: .loc [[DEBUG_INFO_CU]] 8 13 ; CHECK: mul.wide.u32 %rd{{.+}}, %r{{.+}}, 4; ; CHECK: add.s64 %rd{{.+}}, %rd{{.+}}, %rd{{.+}}; -; CHECK: ld.global.f32 %f{{.+}}, [%rd{{.+}}]; +; CHECK: ld.global.b32 %f{{.+}}, [%rd{{.+}}]; ; CHECK: .loc [[DEBUG_INFO_CU]] 8 19 ; CHECK: add.s64 %rd{{.+}}, %rd{{.+}}, %rd{{.+}}; -; CHECK: ld.global.f32 %f{{.+}}, [%rd{{.+}}]; +; CHECK: ld.global.b32 %f{{.+}}, [%rd{{.+}}]; ; CHECK: .loc [[DEBUG_INFO_CU]] 3 82 ; CHECK: fma.rn.f32 %f{{.+}}, %f{{.+}}, %f{{.+}}, %f{{.+}}; ; CHECK: .loc [[DEBUG_INFO_CU]] 3 78 -; CHECK: st.global.f32 [%rd{{.+}}], %f{{.+}}; +; CHECK: st.global.b32 [%rd{{.+}}], %f{{.+}}; ; CHECK: [[BB]]: ; CHECK: .loc [[DEBUG_INFO_CU]] 9 1 ; CHECK: ret; diff --git a/llvm/test/Transforms/NaryReassociate/NVPTX/nary-slsr.ll b/llvm/test/Transforms/NaryReassociate/NVPTX/nary-slsr.ll index 8ec573c2e9ea..bf5f33b03fce 100644 --- a/llvm/test/Transforms/NaryReassociate/NVPTX/nary-slsr.ll +++ b/llvm/test/Transforms/NaryReassociate/NVPTX/nary-slsr.ll @@ -17,7 +17,7 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" define void @nary_reassociate_after_slsr(i32 %a, i32 %b, i32 %c) { ; CHECK-LABEL: 
@nary_reassociate_after_slsr( ; PTX-LABEL: .visible .func nary_reassociate_after_slsr( -; PTX: ld.param.u32 [[b:%r[0-9]+]], [nary_reassociate_after_slsr_param_1]; +; PTX: ld.param.b32 [[b:%r[0-9]+]], [nary_reassociate_after_slsr_param_1]; %ab = add i32 %a, %b %abc = add i32 %ab, %c call void @foo(i32 %abc) diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll index 4474585bf9b0..11f9d7018a02 100644 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll @@ -70,10 +70,10 @@ define void @sum_of_array(i32 %x, i32 %y, ptr nocapture %output) { ret void } ; PTX-LABEL: sum_of_array( -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]] -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+4] -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128] -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+4] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+128] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+132] ; TODO: GVN is unable to preserve the "inbounds" keyword on the first GEP. Need ; some infrastructure changes to enable such optimizations. 
@@ -134,10 +134,10 @@ define void @sum_of_array2(i32 %x, i32 %y, ptr nocapture %output) { ret void } ; PTX-LABEL: sum_of_array2( -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]] -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+4] -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128] -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+4] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+128] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+132] @@ -203,10 +203,10 @@ define void @sum_of_array3(i32 %x, i32 %y, ptr nocapture %output) { ret void } ; PTX-LABEL: sum_of_array3( -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]] -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+4] -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128] -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+4] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+128] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+132] @@ -268,10 +268,10 @@ define void @sum_of_array4(i32 %x, i32 %y, ptr nocapture %output) { ret void } ; PTX-LABEL: sum_of_array4( -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]] -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+4] -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+128] -; PTX-DAG: ld.shared.f32 {{%f[0-9]+}}, [[[BASE_REG]]+132] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG:%(rd|r)[0-9]+]]] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+4] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+128] +; PTX-DAG: ld.shared.b32 {{%f[0-9]+}}, [[[BASE_REG]]+132] @@ -307,7 +307,7 @@ entry: %0 = sext i32 %xy to i64 %p0 = getelementptr inbounds float, ptr %input, i64 %0 %v0 = load float, ptr %p0, align 4 -; 
PTX: ld.f32 %f{{[0-9]+}}, [[[p0:%rd[0-9]+]]] +; PTX: ld.b32 %f{{[0-9]+}}, [[[p0:%rd[0-9]+]]] call void @use(float %v0) %y5 = add nsw i32 %y, 5 @@ -315,7 +315,7 @@ entry: %1 = sext i32 %xy5 to i64 %p1 = getelementptr inbounds float, ptr %input, i64 %1 %v1 = load float, ptr %p1, align 4 -; PTX: ld.f32 %f{{[0-9]+}}, [[[p0]]+20] +; PTX: ld.b32 %f{{[0-9]+}}, [[[p0]]+20] call void @use(float %v1) ret void diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll index edaeef8c87b6..11f2be7b763c 100644 --- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll +++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/reassociate-geps-and-slsr.ll @@ -51,8 +51,8 @@ define void @slsr_after_reassociate_geps(ptr %arr, i32 %i) { ; CHECK-NEXT: ret void ; ; PTX-LABEL: .visible .func slsr_after_reassociate_geps( -; PTX: ld.param.u64 [[arr:%rd[0-9]+]], [slsr_after_reassociate_geps_param_0]; -; PTX: ld.param.u32 [[i:%r[0-9]+]], [slsr_after_reassociate_geps_param_1]; +; PTX: ld.param.b64 [[arr:%rd[0-9]+]], [slsr_after_reassociate_geps_param_0]; +; PTX: ld.param.b32 [[i:%r[0-9]+]], [slsr_after_reassociate_geps_param_1]; %i2 = shl nsw i32 %i, 1 %i3 = mul nsw i32 %i, 3 %i4 = shl nsw i32 %i, 2 @@ -62,28 +62,28 @@ define void @slsr_after_reassociate_geps(ptr %arr, i32 %i) { ; PTX: mul.wide.s32 [[i4:%rd[0-9]+]], [[i]], 4; ; PTX: add.s64 [[base1:%rd[0-9]+]], [[arr]], [[i4]]; %v1 = load float, ptr %p1, align 4 -; PTX: ld.f32 {{%f[0-9]+}}, [[[base1]]+20]; +; PTX: ld.b32 {{%f[0-9]+}}, [[[base1]]+20]; call void @foo(float %v1) %j2 = add nsw i32 %i2, 5 %p2 = getelementptr inbounds float, ptr %arr, i32 %j2 ; PTX: add.s64 [[base2:%rd[0-9]+]], [[base1]], [[i4]]; %v2 = load float, ptr %p2, align 4 -; PTX: ld.f32 {{%f[0-9]+}}, [[[base2]]+20]; +; PTX: ld.b32 {{%f[0-9]+}}, [[[base2]]+20]; call void @foo(float %v2) %j3 = add nsw i32 %i3, 5 %p3 = 
getelementptr inbounds float, ptr %arr, i32 %j3 ; PTX: add.s64 [[base3:%rd[0-9]+]], [[base2]], [[i4]]; %v3 = load float, ptr %p3, align 4 -; PTX: ld.f32 {{%f[0-9]+}}, [[[base3]]+20]; +; PTX: ld.b32 {{%f[0-9]+}}, [[[base3]]+20]; call void @foo(float %v3) %j4 = add nsw i32 %i4, 5 %p4 = getelementptr inbounds float, ptr %arr, i32 %j4 ; PTX: add.s64 [[base4:%rd[0-9]+]], [[base3]], [[i4]]; %v4 = load float, ptr %p4, align 4 -; PTX: ld.f32 {{%f[0-9]+}}, [[[base4]]+20]; +; PTX: ld.b32 {{%f[0-9]+}}, [[[base4]]+20]; call void @foo(float %v4) ret void diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll index 420e844b5103..6d086c3ea55b 100644 --- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll +++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/speculative-slsr.ll @@ -14,8 +14,8 @@ target triple = "nvptx64-nvidia-cuda" define ptx_kernel void @foo(i32 %b, i32 %s) { ; CHECK-LABEL: .visible .entry foo( entry: -; CHECK: ld.param.u32 [[s:%r[0-9]+]], [foo_param_1]; -; CHECK: ld.param.u32 [[b:%r[0-9]+]], [foo_param_0]; +; CHECK: ld.param.b32 [[s:%r[0-9]+]], [foo_param_1]; +; CHECK: ld.param.b32 [[b:%r[0-9]+]], [foo_param_0]; %call = tail call zeroext i1 @cond(i32 0) br i1 %call, label %if.then, label %for.inc diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected index ad0b11ed6a80..b8779b9d54ea 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/nvptx-basic.ll.expected @@ -10,10 +10,10 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: .reg .b64 %rd<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, 
[caller_St8x4_param_0+8]; -; CHECK-NEXT: ld.param.u64 %rd2, [caller_St8x4_param_0]; -; CHECK-NEXT: ld.param.u64 %rd3, [caller_St8x4_param_0+24]; -; CHECK-NEXT: ld.param.u64 %rd4, [caller_St8x4_param_0+16]; +; CHECK-NEXT: ld.param.b64 %rd1, [caller_St8x4_param_0+8]; +; CHECK-NEXT: ld.param.b64 %rd2, [caller_St8x4_param_0]; +; CHECK-NEXT: ld.param.b64 %rd3, [caller_St8x4_param_0+24]; +; CHECK-NEXT: ld.param.b64 %rd4, [caller_St8x4_param_0+16]; ; CHECK-NEXT: { // callseq 0, 0 ; CHECK-NEXT: .param .align 16 .b8 param0[32]; ; CHECK-NEXT: st.param.v2.b64 [param0], {%rd2, %rd1}; @@ -27,11 +27,11 @@ define dso_local void @caller_St8x4(ptr nocapture noundef readonly byval(%struct ; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [retval0]; ; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [retval0+16]; ; CHECK-NEXT: } // callseq 0 -; CHECK-NEXT: ld.param.u32 %r2, [caller_St8x4_param_1]; -; CHECK-NEXT: st.u64 [%r2], %rd5; -; CHECK-NEXT: st.u64 [%r2+8], %rd6; -; CHECK-NEXT: st.u64 [%r2+16], %rd7; -; CHECK-NEXT: st.u64 [%r2+24], %rd8; +; CHECK-NEXT: ld.param.b32 %r2, [caller_St8x4_param_1]; +; CHECK-NEXT: st.b64 [%r2], %rd5; +; CHECK-NEXT: st.b64 [%r2+8], %rd6; +; CHECK-NEXT: st.b64 [%r2+16], %rd7; +; CHECK-NEXT: st.b64 [%r2+24], %rd8; ; CHECK-NEXT: ret; %call = tail call fastcc [4 x i64] @callee_St8x4(ptr noundef nonnull byval(%struct.St8x4) align 8 %in) #2 %.fca.0.extract = extractvalue [4 x i64] %call, 0 @@ -56,8 +56,8 @@ define internal fastcc [4 x i64] @callee_St8x4(ptr nocapture noundef readonly by ; CHECK-NEXT: .reg .b64 %rd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.v2.u64 {%rd1, %rd2}, [callee_St8x4_param_0]; -; CHECK-NEXT: ld.param.v2.u64 {%rd3, %rd4}, [callee_St8x4_param_0+16]; +; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [callee_St8x4_param_0]; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [callee_St8x4_param_0+16]; ; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd1, %rd2}; ; CHECK-NEXT: st.param.v2.b64 [func_retval0+16], {%rd3, %rd4}; ; CHECK-NEXT: ret;