Revamp the NVVMIntrRange pass making the following updates: - Use range attributes over range metadata. This is what instcombine has moved to for ranges on intrinsics in https://github.com/llvm/llvm-project/pull/88776 and it seems a bit cleaner. - Consider the `!"maxntid{x,y,z}"` and `!"reqntid{x,y,z}"` function metadata when adding ranges for `tid` sreg intrinsics. This can allow for smaller ranges and more optimization. - When range attributes are already present, use the intersection of the old and new range. This complements the metadata change by allowing ranges to be shrunk when an intrinsic is in a function which is inlined into a kernel with metadata. While we don't call this more than once yet, we should consider adding a second call after inlining, once this has had a chance to soak for a while and no issues have arisen. I've also re-enabled this pass in the TM, it was disabled years ago due to "numerical discrepancies" https://reviews.llvm.org/D96166. In our testing we haven't seen any issues with adding ranges to intrinsics, and I cannot find any further info about what issues were encountered.
89 lines
3.9 KiB
LLVM
89 lines
3.9 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5
|
|
; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s
|
|
|
|
;; Kernel annotated in !nvvm.annotations with "maxntidx" = 32 and
;; "maxntidz" = 3. Every tid/ntid read is expected to gain a range attribute:
;; [0, 96) for tid.x and tid.y, [0, 64) for tid.z, and [1, 97) / [1, 97) /
;; [1, 65) for ntid.x / ntid.y / ntid.z.
;; NOTE(review): 96 equals 32 * 3, so the x/y bound is presumably derived from
;; the product of the annotated maxima, and 64 is presumably a built-in limit
;; for the z dimension -- confirm against the pass implementation.
define i32 @test_maxntid() {
|
|
; CHECK-LABEL: define i32 @test_maxntid(
|
|
; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
|
|
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
|
; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
|
|
; CHECK-NEXT: [[TMP11:%.*]] = call range(i32 1, 97) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
; CHECK-NEXT: [[TMP4:%.*]] = call range(i32 1, 97) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
|
|
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP1]], [[TMP3]]
|
|
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[TMP2]]
|
|
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP11]]
|
|
; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP4]]
|
|
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP10]], [[TMP6]]
|
|
; CHECK-NEXT: ret i32 [[TMP5]]
|
|
;
|
|
%1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
%2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
|
%3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
|
|
%4 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
%5 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
|
|
%6 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
|
|
%7 = add i32 %1, %2
|
|
%8 = add i32 %7, %3
|
|
%9 = add i32 %8, %4
|
|
%10 = add i32 %9, %5
|
|
%11 = add i32 %10, %6
|
|
ret i32 %11
|
|
}
|
|
|
|
;; Kernel annotated in !nvvm.annotations with "reqntidx" = 20 only. Every tid
;; read is expected to carry range [0, 20) and every ntid read range [1, 21),
;; in all three dimensions.
;; NOTE(review): unlike @test_maxntid, this function returns %5 (the ntid.y
;; read) rather than the final sum %11, so the add chain is otherwise unused --
;; confirm whether that is intended.
define i32 @test_reqntid() {
|
|
; CHECK-LABEL: define i32 @test_reqntid(
|
|
; CHECK-SAME: ) #[[ATTR0]] {
|
|
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
|
; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.z()
|
|
; CHECK-NEXT: [[TMP4:%.*]] = call range(i32 1, 21) i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
; CHECK-NEXT: [[TMP3:%.*]] = call range(i32 1, 21) i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
|
|
; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 1, 21) i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
|
|
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP1]], [[TMP5]]
|
|
; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[TMP2]]
|
|
; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP4]]
|
|
; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP3]]
|
|
; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP6]]
|
|
; CHECK-NEXT: ret i32 [[TMP3]]
|
|
;
|
|
%1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
%2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
|
%3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
|
|
%4 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
%5 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
|
|
%6 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
|
|
%7 = add i32 %1, %2
|
|
%8 = add i32 %7, %3
|
|
%9 = add i32 %8, %4
|
|
%10 = add i32 %9, %5
|
|
%11 = add i32 %10, %6
|
|
ret i32 %5
|
|
}
|
|
|
|
;; A case like this could occur if a function containing the sreg intrinsic was
|
|
;; inlined into a kernel where the tid metadata is present. The call already
|
|
;; has range(i32 0, 1024); with "maxntidx" = 4 it should shrink to [0, 4).
|
|
define i32 @test_inlined() {
|
|
; CHECK-LABEL: define i32 @test_inlined(
|
|
; CHECK-SAME: ) #[[ATTR0]] {
|
|
; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 4) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
; CHECK-NEXT: ret i32 [[TMP1]]
|
|
;
|
|
%1 = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
ret i32 %1
|
|
}
|
|
|
|
;; tid.{x,y,z} sreg intrinsics called by the test functions in this file.
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
|
declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
|
|
declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
|
|
|
|
;; ntid.{x,y,z} sreg intrinsics called by @test_maxntid and @test_reqntid.
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
|
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
|
|
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
|
|
|
|
;; One annotation node per test function. Each node names the function, tags it
;; with "kernel" = 1, and lists the maxntid*/reqntid* values the expected ranges
;; in that test are based on.
!nvvm.annotations = !{!0, !1, !2}
|
|
;; @test_maxntid: "maxntidx" = 32, "maxntidz" = 3 (no y value given).
!0 = !{ptr @test_maxntid, !"kernel", i32 1, !"maxntidx", i32 32, !"maxntidz", i32 3}
|
|
;; @test_reqntid: "reqntidx" = 20 only.
!1 = !{ptr @test_reqntid, !"kernel", i32 1, !"reqntidx", i32 20}
|
|
;; @test_inlined: "maxntidx" = 4 only.
!2 = !{ptr @test_inlined, !"kernel", i32 1, !"maxntidx", i32 4}
|