Emit .ptr, .address-space, and .align attributes for kernel args in CUDA (previously handled only for OpenCL). This allows for more vectorization opportunities if the PTX consumer is able to know about the pointer alignments. If no alignment is explicitly specified, .align 1 will be emitted to match the LLVM IR semantics in this case. PTX ISA doc - https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#kernel-parameter-attribute-ptr This is a rework of the original patch proposed in #79646 --------- Co-authored-by: Vandana <vandanak@nvidia.com>
43 lines
1.8 KiB
LLVM
43 lines
1.8 KiB
LLVM
; RUN: llc < %s -march=nvptx64 -mcpu=sm_60 | FileCheck %s
|
|
; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_60 | %ptxas-verify %}
|
|
|
|
; Named struct type wrapping a single array of 16 doubles.
; NOTE(review): nothing in the visible part of this test references this
; type -- confirm whether it is still needed.
%struct.Large = type { [16 x double] }
|
|
|
|
; CHECK-LABEL: .entry func_align(
|
|
; CHECK: .param .u64 .ptr .align 1 func_align_param_0
|
|
; CHECK: .param .u64 .ptr .align 2 func_align_param_1
|
|
; CHECK: .param .u64 .ptr .global .align 4 func_align_param_2
|
|
; CHECK: .param .u64 .ptr .shared .align 8 func_align_param_3
|
|
; CHECK: .param .u64 .ptr .const .align 16 func_align_param_4
|
|
; CHECK: .param .u64 .ptr .local .align 32 func_align_param_5
|
|
; Kernel with six pointer parameters, each carrying an explicit `align`
; attribute (1, 2, 4, 8, 16 and 32 in parameter order). The first two
; pointers live in the default address space; the remaining four use
; address spaces 1, 3, 4 and 5. The body is intentionally empty: only the
; parameter declarations matter to the CHECK lines that precede this
; definition.
define void @func_align(
    ptr align 1 readonly nocapture %input,
    ptr align 2 nocapture %out,
    ptr addrspace(1) align 4 %global,
    ptr addrspace(3) align 8 %shared,
    ptr addrspace(4) align 16 %const,
    ptr addrspace(5) align 32 %local
) {
entry:
  ret void
}
|
|
|
|
; CHECK-LABEL: .entry func_noalign(
|
|
; CHECK: .param .u64 .ptr .align 1 func_noalign_param_0
|
|
; CHECK: .param .u64 .ptr .align 1 func_noalign_param_1
|
|
; CHECK: .param .u64 .ptr .global .align 1 func_noalign_param_2
|
|
; CHECK: .param .u64 .ptr .shared .align 1 func_noalign_param_3
|
|
; CHECK: .param .u64 .ptr .const .align 1 func_noalign_param_4
|
|
; CHECK: .param .u64 .ptr .local .align 1 func_noalign_param_5
|
|
; Kernel with the same six pointer parameters (default address space twice,
; then address spaces 1, 3, 4 and 5) but with no `align` attribute on any
; of them. The CHECK lines preceding this definition expect `.align 1` for
; every parameter. The body is intentionally empty.
define void @func_noalign(
    ptr readonly nocapture %input,
    ptr nocapture %out,
    ptr addrspace(1) %global,
    ptr addrspace(3) %shared,
    ptr addrspace(4) %const,
    ptr addrspace(5) %local
) {
entry:
  ret void
}
|
|
|
|
; One metadata node per function, each tagging it with the "kernel"
; annotation set to 1; both nodes are then listed under the named metadata
; !nvvm.annotations.
!0 = !{ptr @func_align, !"kernel", i32 1}
!1 = !{ptr @func_noalign, !"kernel", i32 1}

!nvvm.annotations = !{!0, !1}
|