This commit adds the -amdgpu-lower-buffer-fat-pointers pass, which is
applicable to all AMDGCN compilations.
The purpose of this pass is to remove the type `ptr addrspace(7)` from
incoming IR. This must be done at the LLVM IR level because `ptr
addrspace(7)`, as a 160-bit primitive type, cannot be correctly handled
by SelectionDAG.
The detailed operation of the pass is described in comments, but, in
summary, the removal proceeds by:
1. Rewriting loads and stores of ptr addrspace(7) to loads and stores of
i160 (including vectors and aggregates). This is needed because the
in-register representation of these pointers will stop matching their
in-memory representation in step 2, and so ptrtoint/inttoptr operations
are used to preserve the expected memory layout.
2. Mutating the IR to replace all occurrences of `ptr addrspace(7)` with
the type `{ptr addrspace(8), ptr addrspace(6) }`, which makes the two
parts of a buffer fat pointer (the 128-bit address space 8 resource and
the 32-bit address space 6 offset) visible in the IR. This also impacts
the argument and return types of functions.
3. *Splitting* the resource and offset parts. All instructions that
produce or consume buffer fat pointers (like GEP or load) are rewritten
to produce or consume the resource and offset parts separately. For
example, GEP updates the offset part of the result and a load uses the
resource and offset parts to populate the relevant
llvm.amdgcn.raw.ptr.buffer.load intrinsic call.
At the end of this process, the original mutated instructions are
replaced by their new split counterparts, ensuring no invalidly-typed IR
escapes this pass. (For operations like call, where the struct form is
needed, insertvalue operations are inserted.)
Compared to LGC's PatchBufferOp (32cda89776/lgc/patch/PatchBufferOp.cpp),
this pass:
- Also handles vectors of ptr addrspace(7)s
- Also handles function boundaries
- Includes the same uniform buffer optimization for loops and
conditionals
- Does *not* handle memcpy() and friends (this is future work)
- Does *not* break up large loads and stores into smaller parts. This
should be handled by extending the legalization
of *.buffer.{load,store} to handle larger types by producing multiple
instructions (the same way ordinary LOAD and STORE are legalized). That
work is planned for a followup commit.
- Does *not* have special logic for handling divergent buffer
descriptors. The logic in LGC is, as far as I can tell, incorrect in
general, and, per discussions with @nhaehnle, isn't widely used.
Therefore, divergent descriptors are handled with waterfall loops later
in legalization.
As a final matter, this commit updates atomic expansion to treat buffer
operations analogously to global ones.
(One question for reviewers: is the new pass in the right place? Should
it be later in the pipeline?)
Differential Revision: https://reviews.llvm.org/D158463
459 lines
21 KiB
LLVM
459 lines
21 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt -S -mcpu=gfx900 -amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s
; RUN: opt -S -mcpu=gfx900 -passes=amdgpu-lower-buffer-fat-pointers < %s | FileCheck %s

;; Each function below carries a `ptr addrspace(7)` value through a loop, a
;; select or a phi. The CHECK lines record whether the lowered IR keeps only an
;; i32 offset or keeps both a `ptr addrspace(8)` resource and an i32 offset.
;; Both RUN lines request the same pass (once as a bare flag, once through
;; -passes=) and share a single set of CHECK lines.

;; In the data layout, `p7:160:256:256:32` gives address space 7 pointers a
;; 160-bit size with a 32-bit index, `p8:128:128` gives address space 8 pointers
;; a 128-bit size, and `ni:7:8` marks both address spaces as non-integral.
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
target triple = "amdgcn--"

;; This should optimize to just the offset part
;; The pointer is derived from %buf once and then only advanced with a GEP, so
;; the expected output has a single i32 offset phi and passes %buf directly to
;; the buffer load.
define float @sum(ptr addrspace(8) %buf, i32 %len) {
; CHECK-LABEL: define float @sum
; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], i32 [[LEN:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[SUM_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[PTR_PREV_OFF:%.*]] = phi i32 [ [[PTR:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 [[PTR_PREV_OFF]], i32 0, i32 0)
; CHECK-NEXT: [[SUM]] = fadd float [[SUM_PREV]], [[VAL]]
; CHECK-NEXT: [[PTR]] = add i32 [[PTR_PREV_OFF]], 4
; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
; CHECK-NEXT: [[TEST:%.*]] = icmp ult i32 [[I_NEXT]], [[LEN]]
; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: ret float [[SUM]]
;
entry:
  %start = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
  br label %loop
loop:
  %sum.prev = phi float [ %sum, %loop ], [ 0.0, %entry ]
  %ptr.prev = phi ptr addrspace(7) [ %ptr, %loop ], [ %start, %entry ]
  %i = phi i32 [ %i.next, %loop ], [ 0, %entry ]

  %val = load float, ptr addrspace(7) %ptr.prev
  %sum = fadd float %sum.prev, %val

  ;; Step to the next float; the CHECK lines expect this as `add i32 ..., 4`.
  %ptr = getelementptr float, ptr addrspace(7) %ptr.prev, i32 1
  %i.next = add i32 %i, 1
  %test = icmp ult i32 %i.next, %len
  br i1 %test, label %loop, label %exit
exit:
  ret float %sum
}

;; But this should not
;; Same loop as @sum, except the pointer is advanced by ptrtoint / add i160 /
;; inttoptr instead of a GEP. The expected output keeps a resource phi next to
;; the offset phi and rebuilds both parts from the i160 value on each iteration.
define float @sum_integer_ops(ptr addrspace(8) %buf, i32 %len) {
; CHECK-LABEL: define float @sum_integer_ops
; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[SUM_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[PTR_PREV_RSRC:%.*]] = phi ptr addrspace(8) [ [[PTR_RSRC:%.*]], [[LOOP]] ], [ [[BUF]], [[ENTRY]] ]
; CHECK-NEXT: [[PTR_PREV_OFF:%.*]] = phi i32 [ [[PTR_OFF:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[PTR_PREV_RSRC]], i32 [[PTR_PREV_OFF]], i32 0, i32 0)
; CHECK-NEXT: [[SUM]] = fadd float [[SUM_PREV]], [[VAL]]
; CHECK-NEXT: [[PTR_PREV_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[PTR_PREV_RSRC]] to i160
; CHECK-NEXT: [[TMP0:%.*]] = shl nuw i160 [[PTR_PREV_INT_RSRC]], 32
; CHECK-NEXT: [[PTR_PREV_INT_OFF:%.*]] = zext i32 [[PTR_PREV_OFF]] to i160
; CHECK-NEXT: [[PTR_PREV_INT:%.*]] = or i160 [[TMP0]], [[PTR_PREV_INT_OFF]]
; CHECK-NEXT: [[PTR_INT:%.*]] = add i160 [[PTR_PREV_INT]], 4
; CHECK-NEXT: [[TMP1:%.*]] = lshr i160 [[PTR_INT]], 32
; CHECK-NEXT: [[TMP2:%.*]] = trunc i160 [[TMP1]] to i128
; CHECK-NEXT: [[PTR_RSRC]] = inttoptr i128 [[TMP2]] to ptr addrspace(8)
; CHECK-NEXT: [[PTR_OFF]] = trunc i160 [[PTR_INT]] to i32
; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
; CHECK-NEXT: [[TEST:%.*]] = icmp ult i32 [[I_NEXT]], [[LEN]]
; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: ret float [[SUM]]
;
entry:
  %start = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
  br label %loop
loop:
  %sum.prev = phi float [ %sum, %loop ], [ 0.0, %entry ]
  %ptr.prev = phi ptr addrspace(7) [ %ptr, %loop ], [ %start, %entry ]
  %i = phi i32 [ %i.next, %loop ], [ 0, %entry ]

  %val = load float, ptr addrspace(7) %ptr.prev
  %sum = fadd float %sum.prev, %val

  ;; Advance through the integer form of the pointer rather than a GEP.
  %ptr.prev.int = ptrtoint ptr addrspace(7) %ptr.prev to i160
  %ptr.int = add i160 %ptr.prev.int, 4
  %ptr = inttoptr i160 %ptr.int to ptr addrspace(7)
  %i.next = add i32 %i, 1
  %test = icmp ult i32 %i.next, %len
  br i1 %test, label %loop, label %exit
exit:
  ret float %sum
}

;; Should go to offsets only
;; Two nested loops whose pointer phis are chained together (%ptr1.prev feeds
;; %ptr2.prev, and %ptr2 feeds %ptr1). Every pointer still derives from %buf via
;; GEPs, so the expected output has only i32 offset phis in both loop headers.
define float @sum_2d(ptr addrspace(8) %buf, i32 %ii, i32 %jj) {
; CHECK-LABEL: define float @sum_2d
; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], i32 [[II:%.*]], i32 [[JJ:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP1_ENTRY:%.*]]
; CHECK: loop1.entry:
; CHECK-NEXT: [[SUM1_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP1_EXIT:%.*]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP1_EXIT]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[PTR1_PREV_OFF:%.*]] = phi i32 [ [[PTR1:%.*]], [[LOOP1_EXIT]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[LOOP2:%.*]]
; CHECK: loop2:
; CHECK-NEXT: [[SUM2_PREV:%.*]] = phi float [ [[SUM]], [[LOOP2]] ], [ [[SUM1_PREV]], [[LOOP1_ENTRY]] ]
; CHECK-NEXT: [[J:%.*]] = phi i32 [ [[J_NEXT:%.*]], [[LOOP2]] ], [ 0, [[LOOP1_ENTRY]] ]
; CHECK-NEXT: [[PTR2_PREV_OFF:%.*]] = phi i32 [ [[PTR2:%.*]], [[LOOP2]] ], [ [[PTR1_PREV_OFF]], [[LOOP1_ENTRY]] ]
; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 [[PTR2_PREV_OFF]], i32 0, i32 0)
; CHECK-NEXT: [[SUM]] = fadd float [[SUM2_PREV]], [[VAL]]
; CHECK-NEXT: [[PTR2]] = add i32 [[PTR2_PREV_OFF]], 4
; CHECK-NEXT: [[J_NEXT]] = add i32 [[J]], 1
; CHECK-NEXT: [[TEST2:%.*]] = icmp ult i32 [[J_NEXT]], [[JJ]]
; CHECK-NEXT: br i1 [[TEST2]], label [[LOOP2]], label [[LOOP1_EXIT]]
; CHECK: loop1.exit:
; CHECK-NEXT: [[PTR1]] = add i32 [[PTR2]], 4
; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
; CHECK-NEXT: [[TEST1:%.*]] = icmp ult i32 [[I_NEXT]], [[II]]
; CHECK-NEXT: br i1 [[TEST1]], label [[LOOP1_ENTRY]], label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: ret float [[SUM]]
;
entry:
  %start = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
  br label %loop1.entry
loop1.entry:
  %sum1.prev = phi float [ %sum, %loop1.exit ], [ 0.0, %entry ]
  %ptr1.prev = phi ptr addrspace(7) [ %ptr1, %loop1.exit ], [ %start, %entry ]
  %i = phi i32 [ %i.next, %loop1.exit ], [ 0, %entry ]

  br label %loop2
loop2:
  %sum2.prev = phi float [ %sum, %loop2 ], [ %sum1.prev, %loop1.entry ]
  ;; The inner pointer starts from the outer loop's current pointer.
  %ptr2.prev = phi ptr addrspace(7) [ %ptr2, %loop2 ], [ %ptr1.prev, %loop1.entry ]
  %j = phi i32 [ %j.next, %loop2 ], [ 0, %loop1.entry ]

  %val = load float, ptr addrspace(7) %ptr2.prev
  %sum = fadd float %sum2.prev, %val

  %ptr2 = getelementptr float, ptr addrspace(7) %ptr2.prev, i32 1
  %j.next = add i32 %j, 1
  %test2 = icmp ult i32 %j.next, %jj

  br i1 %test2, label %loop2, label %loop1.exit
loop1.exit:
  ;; The outer pointer continues one float past where the inner loop stopped.
  %ptr1 = getelementptr float, ptr addrspace(7) %ptr2, i32 1
  %i.next = add i32 %i, 1
  %test1 = icmp ult i32 %i.next, %ii
  br i1 %test1, label %loop1.entry, label %exit
exit:
  ret float %sum
}

;; This should optimize to just the offset parts since all the arguments to the
;; select point to the same buffer.
;; Both select operands are GEPs off %ptr.prev, so the expected output is a
;; select between two i32 offsets and a load that uses %buf directly.
define float @sum_jump_on_negative(ptr addrspace(8) %buf, i32 %len) {
; CHECK-LABEL: define float @sum_jump_on_negative
; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[SUM_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[PTR_PREV_OFF:%.*]] = phi i32 [ [[PTR_OFF:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 [[PTR_PREV_OFF]], i32 0, i32 0)
; CHECK-NEXT: [[SUM]] = fadd float [[SUM_PREV]], [[VAL]]
; CHECK-NEXT: [[SKIP_NEXT:%.*]] = fcmp olt float [[VAL]], 0.000000e+00
; CHECK-NEXT: [[SMALL_JUMP:%.*]] = add i32 [[PTR_PREV_OFF]], 4
; CHECK-NEXT: [[LARGE_JUMP:%.*]] = add i32 [[PTR_PREV_OFF]], 8
; CHECK-NEXT: [[PTR_OFF]] = select i1 [[SKIP_NEXT]], i32 [[LARGE_JUMP]], i32 [[SMALL_JUMP]]
; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
; CHECK-NEXT: [[TEST:%.*]] = icmp ult i32 [[I_NEXT]], [[LEN]]
; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: ret float [[SUM]]
;
entry:
  %start = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
  br label %loop
loop:
  %sum.prev = phi float [ %sum, %loop ], [ 0.0, %entry ]
  %ptr.prev = phi ptr addrspace(7) [ %ptr, %loop ], [ %start, %entry ]
  %i = phi i32 [ %i.next, %loop ], [ 0, %entry ]

  %val = load float, ptr addrspace(7) %ptr.prev
  %sum = fadd float %sum.prev, %val

  ;; A negative value advances two floats instead of one.
  %skip.next = fcmp olt float %val, 0.0
  %small.jump = getelementptr float, ptr addrspace(7) %ptr.prev, i32 1
  %large.jump = getelementptr float, ptr addrspace(7) %ptr.prev, i32 2
  %ptr = select i1 %skip.next, ptr addrspace(7) %large.jump, ptr addrspace(7) %small.jump

  %i.next = add i32 %i, 1
  %test = icmp ult i32 %i.next, %len
  br i1 %test, label %loop, label %exit
exit:
  ret float %sum
}

;; Control-flow form of the one-or-two-float jump: the next pointer is chosen by
;; a phi over the %then / %else blocks rather than by a select. Both incoming
;; values are GEPs off %ptr.prev, and the expected output is a phi of i32
;; offsets with the load still using %buf directly.
define float @sum_jump_on_negative_with_phi(ptr addrspace(8) %buf, i32 %len) {
; CHECK-LABEL: define float @sum_jump_on_negative_with_phi
; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[SUM_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP_EXIT:%.*]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP_EXIT]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[PTR_PREV_OFF:%.*]] = phi i32 [ [[PTR_OFF:%.*]], [[LOOP_EXIT]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 [[PTR_PREV_OFF]], i32 0, i32 0)
; CHECK-NEXT: [[SUM]] = fadd float [[SUM_PREV]], [[VAL]]
; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
; CHECK-NEXT: [[TEST:%.*]] = icmp ult i32 [[I_NEXT]], [[LEN]]
; CHECK-NEXT: [[SKIP_NEXT:%.*]] = fcmp olt float [[VAL]], 0.000000e+00
; CHECK-NEXT: br i1 [[SKIP_NEXT]], label [[THEN:%.*]], label [[ELSE:%.*]]
; CHECK: then:
; CHECK-NEXT: [[LARGE_JUMP:%.*]] = add i32 [[PTR_PREV_OFF]], 8
; CHECK-NEXT: br label [[LOOP_EXIT]]
; CHECK: else:
; CHECK-NEXT: [[SMALL_JUMP:%.*]] = add i32 [[PTR_PREV_OFF]], 4
; CHECK-NEXT: br label [[LOOP_EXIT]]
; CHECK: loop.exit:
; CHECK-NEXT: [[PTR_OFF]] = phi i32 [ [[LARGE_JUMP]], [[THEN]] ], [ [[SMALL_JUMP]], [[ELSE]] ]
; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: ret float [[SUM]]
;
entry:
  %start = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
  br label %loop
loop:
  %sum.prev = phi float [ %sum, %loop.exit ], [ 0.0, %entry ]
  %ptr.prev = phi ptr addrspace(7) [ %ptr, %loop.exit ], [ %start, %entry ]
  %i = phi i32 [ %i.next, %loop.exit ], [ 0, %entry ]

  %val = load float, ptr addrspace(7) %ptr.prev
  %sum = fadd float %sum.prev, %val

  ;; The loop test is computed here but only branched on in %loop.exit.
  %i.next = add i32 %i, 1
  %test = icmp ult i32 %i.next, %len

  %skip.next = fcmp olt float %val, 0.0
  br i1 %skip.next, label %then, label %else
then:
  %large.jump = getelementptr float, ptr addrspace(7) %ptr.prev, i32 2
  br label %loop.exit
else:
  %small.jump = getelementptr float, ptr addrspace(7) %ptr.prev, i32 1
  br label %loop.exit
loop.exit:
  %ptr = phi ptr addrspace(7) [ %large.jump, %then ], [ %small.jump, %else ]
  br i1 %test, label %loop, label %exit
exit:
  ret float %sum
}

;; But this has a shifting resource part.
;; The select chooses between a GEP off the current pointer and %start2, which
;; is derived from a second buffer argument. The expected output therefore has
;; a resource phi and a resource select in addition to the offset ones.
define float @sum_new_buffer_on_negative(ptr addrspace(8) %buf1, ptr addrspace(8) %buf2, i32 %len) {
; CHECK-LABEL: define float @sum_new_buffer_on_negative
; CHECK-SAME: (ptr addrspace(8) [[BUF1:%.*]], ptr addrspace(8) [[BUF2:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[SUM_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[PTR_PREV_RSRC:%.*]] = phi ptr addrspace(8) [ [[PTR_RSRC:%.*]], [[LOOP]] ], [ [[BUF1]], [[ENTRY]] ]
; CHECK-NEXT: [[PTR_PREV_OFF:%.*]] = phi i32 [ [[PTR_OFF:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[PTR_PREV_RSRC]], i32 [[PTR_PREV_OFF]], i32 0, i32 0)
; CHECK-NEXT: [[SUM]] = fadd float [[SUM_PREV]], [[VAL]]
; CHECK-NEXT: [[HOP:%.*]] = fcmp olt float [[VAL]], 0.000000e+00
; CHECK-NEXT: [[THIS_NEXT:%.*]] = add i32 [[PTR_PREV_OFF]], 4
; CHECK-NEXT: [[PTR_RSRC]] = select i1 [[HOP]], ptr addrspace(8) [[PTR_PREV_RSRC]], ptr addrspace(8) [[BUF2]]
; CHECK-NEXT: [[PTR_OFF]] = select i1 [[HOP]], i32 [[THIS_NEXT]], i32 0
; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
; CHECK-NEXT: [[TEST:%.*]] = icmp ult i32 [[I_NEXT]], [[LEN]]
; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: ret float [[SUM]]
;
entry:
  %start = addrspacecast ptr addrspace(8) %buf1 to ptr addrspace(7)
  %start2 = addrspacecast ptr addrspace(8) %buf2 to ptr addrspace(7)
  br label %loop
loop:
  %sum.prev = phi float [ %sum, %loop ], [ 0.0, %entry ]
  %ptr.prev = phi ptr addrspace(7) [ %ptr, %loop ], [ %start, %entry ]
  %i = phi i32 [ %i.next, %loop ], [ 0, %entry ]

  %val = load float, ptr addrspace(7) %ptr.prev
  %sum = fadd float %sum.prev, %val

  ;; When %hop is true the pointer steps within the current buffer; otherwise
  ;; it is replaced by the start of the second buffer.
  %hop = fcmp olt float %val, 0.0
  %this.next = getelementptr float, ptr addrspace(7) %ptr.prev, i32 1
  %ptr = select i1 %hop, ptr addrspace(7) %this.next, ptr addrspace(7) %start2

  %i.next = add i32 %i, 1
  %test = icmp ult i32 %i.next, %len
  br i1 %test, label %loop, label %exit
exit:
  ret float %sum
}

;; As does this.
;; Control-flow form of the two-buffer case: %ptr is a phi whose incoming values
;; are a GEP off the current pointer (from %then) and %start2 (straight from
;; %loop). The expected output has separate resource and offset phis in both
;; %loop and %loop.exit.
define float @sum_new_buffer_on_negative_with_phi(ptr addrspace(8) %buf1, ptr addrspace(8) %buf2, i32 %len) {
; CHECK-LABEL: define float @sum_new_buffer_on_negative_with_phi
; CHECK-SAME: (ptr addrspace(8) [[BUF1:%.*]], ptr addrspace(8) [[BUF2:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[SUM_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP_EXIT:%.*]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP_EXIT]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[PTR_PREV_RSRC:%.*]] = phi ptr addrspace(8) [ [[PTR_RSRC:%.*]], [[LOOP_EXIT]] ], [ [[BUF1]], [[ENTRY]] ]
; CHECK-NEXT: [[PTR_PREV_OFF:%.*]] = phi i32 [ [[PTR_OFF:%.*]], [[LOOP_EXIT]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[PTR_PREV_RSRC]], i32 [[PTR_PREV_OFF]], i32 0, i32 0)
; CHECK-NEXT: [[SUM]] = fadd float [[SUM_PREV]], [[VAL]]
; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
; CHECK-NEXT: [[TEST:%.*]] = icmp ult i32 [[I_NEXT]], [[LEN]]
; CHECK-NEXT: [[HOP:%.*]] = fcmp olt float [[VAL]], 0.000000e+00
; CHECK-NEXT: br i1 [[HOP]], label [[THEN:%.*]], label [[LOOP_EXIT]]
; CHECK: then:
; CHECK-NEXT: [[THIS_NEXT:%.*]] = add i32 [[PTR_PREV_OFF]], 4
; CHECK-NEXT: br label [[LOOP_EXIT]]
; CHECK: loop.exit:
; CHECK-NEXT: [[PTR_RSRC]] = phi ptr addrspace(8) [ [[PTR_PREV_RSRC]], [[THEN]] ], [ [[BUF2]], [[LOOP]] ]
; CHECK-NEXT: [[PTR_OFF]] = phi i32 [ [[THIS_NEXT]], [[THEN]] ], [ 0, [[LOOP]] ]
; CHECK-NEXT: br i1 [[TEST]], label [[LOOP]], label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: ret float [[SUM]]
;
entry:
  %start = addrspacecast ptr addrspace(8) %buf1 to ptr addrspace(7)
  %start2 = addrspacecast ptr addrspace(8) %buf2 to ptr addrspace(7)
  br label %loop
loop:
  %sum.prev = phi float [ %sum, %loop.exit ], [ 0.0, %entry ]
  %ptr.prev = phi ptr addrspace(7) [ %ptr, %loop.exit ], [ %start, %entry ]
  %i = phi i32 [ %i.next, %loop.exit ], [ 0, %entry ]

  %val = load float, ptr addrspace(7) %ptr.prev
  %sum = fadd float %sum.prev, %val

  %i.next = add i32 %i, 1
  %test = icmp ult i32 %i.next, %len
  %hop = fcmp olt float %val, 0.0
  br i1 %hop, label %then, label %loop.exit
then:
  %this.next = getelementptr float, ptr addrspace(7) %ptr.prev, i32 1
  br label %loop.exit
loop.exit:
  ;; The edge straight from %loop brings in the second buffer's start pointer.
  %ptr = phi ptr addrspace(7) [ %this.next, %then ], [ %start2, %loop ]
  br i1 %test, label %loop, label %exit
exit:
  ret float %sum
}

;; Test that the uniform buffer descriptor optimization works correctly for phi
;; nodes that repeat the same predecessor multiple times.
;; The switch's default destination and its `i32 1` case both target %loop, so
;; %loop appears twice as its own predecessor and every phi lists it twice. The
;; expected output still has only an i32 offset phi for the pointer.
define float @sum_duplicate_preds(ptr addrspace(8) %buf, i32 %len) {
; CHECK-LABEL: define float @sum_duplicate_preds
; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[SUM_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP]] ], [ [[SUM]], [[LOOP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ], [ [[I_NEXT]], [[LOOP]] ]
; CHECK-NEXT: [[PTR_PREV_OFF:%.*]] = phi i32 [ [[PTR:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ], [ [[PTR]], [[LOOP]] ]
; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 [[PTR_PREV_OFF]], i32 0, i32 0)
; CHECK-NEXT: [[SUM]] = fadd float [[SUM_PREV]], [[VAL]]
; CHECK-NEXT: [[PTR]] = add i32 [[PTR_PREV_OFF]], 4
; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
; CHECK-NEXT: [[TEST:%.*]] = icmp ult i32 [[I_NEXT]], [[LEN]]
; CHECK-NEXT: [[TEST_EXT:%.*]] = zext i1 [[TEST]] to i32
; CHECK-NEXT: switch i32 [[TEST_EXT]], label [[LOOP]] [
; CHECK-NEXT: i32 1, label [[LOOP]]
; CHECK-NEXT: i32 0, label [[EXIT:%.*]]
; CHECK-NEXT: ]
; CHECK: exit:
; CHECK-NEXT: ret float [[SUM]]
;
entry:
  %start = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
  br label %loop
loop:
  %sum.prev = phi float [ %sum, %loop ], [ %sum, %loop ], [ 0.0, %entry ]
  %ptr.prev = phi ptr addrspace(7) [ %ptr, %loop ], [ %start, %entry ], [ %ptr, %loop ]
  %i = phi i32 [ %i.next, %loop ], [ 0, %entry ], [ %i.next, %loop ]

  %val = load float, ptr addrspace(7) %ptr.prev
  %sum = fadd float %sum.prev, %val

  %ptr = getelementptr float, ptr addrspace(7) %ptr.prev, i32 1
  %i.next = add i32 %i, 1
  %test = icmp ult i32 %i.next, %len
  ;; Widen the i1 so a switch can produce two edges back to %loop.
  %test.ext = zext i1 %test to i32
  switch i32 %test.ext, label %loop [
    i32 1, label %loop
    i32 0, label %exit
  ]
exit:
  ret float %sum
}

;; And similarly check the "might not be uniform" case.
;; The loop again has two switch edges back to %loop, but the pointer is
;; advanced through ptrtoint / add i160 / inttoptr. The expected output keeps a
;; resource phi and an offset phi, each listing %loop twice.
define float @sum_integer_ops_duplicate_preds(ptr addrspace(8) %buf, i32 %len) {
; CHECK-LABEL: define float @sum_integer_ops_duplicate_preds
; CHECK-SAME: (ptr addrspace(8) [[BUF:%.*]], i32 [[LEN:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[SUM_PREV:%.*]] = phi float [ [[SUM:%.*]], [[LOOP]] ], [ [[SUM]], [[LOOP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[I_NEXT:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ], [ [[I_NEXT]], [[LOOP]] ]
; CHECK-NEXT: [[PTR_PREV_RSRC:%.*]] = phi ptr addrspace(8) [ [[PTR_RSRC:%.*]], [[LOOP]] ], [ [[BUF]], [[ENTRY]] ], [ [[PTR_RSRC]], [[LOOP]] ]
; CHECK-NEXT: [[PTR_PREV_OFF:%.*]] = phi i32 [ [[PTR_OFF:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ], [ [[PTR_OFF]], [[LOOP]] ]
; CHECK-NEXT: [[VAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[PTR_PREV_RSRC]], i32 [[PTR_PREV_OFF]], i32 0, i32 0)
; CHECK-NEXT: [[SUM]] = fadd float [[SUM_PREV]], [[VAL]]
; CHECK-NEXT: [[PTR_PREV_INT_RSRC:%.*]] = ptrtoint ptr addrspace(8) [[PTR_PREV_RSRC]] to i160
; CHECK-NEXT: [[TMP0:%.*]] = shl nuw i160 [[PTR_PREV_INT_RSRC]], 32
; CHECK-NEXT: [[PTR_PREV_INT_OFF:%.*]] = zext i32 [[PTR_PREV_OFF]] to i160
; CHECK-NEXT: [[PTR_PREV_INT:%.*]] = or i160 [[TMP0]], [[PTR_PREV_INT_OFF]]
; CHECK-NEXT: [[PTR_INT:%.*]] = add i160 [[PTR_PREV_INT]], 4
; CHECK-NEXT: [[TMP1:%.*]] = lshr i160 [[PTR_INT]], 32
; CHECK-NEXT: [[TMP2:%.*]] = trunc i160 [[TMP1]] to i128
; CHECK-NEXT: [[PTR_RSRC]] = inttoptr i128 [[TMP2]] to ptr addrspace(8)
; CHECK-NEXT: [[PTR_OFF]] = trunc i160 [[PTR_INT]] to i32
; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1
; CHECK-NEXT: [[TEST:%.*]] = icmp ult i32 [[I_NEXT]], [[LEN]]
; CHECK-NEXT: [[TEST_EXT:%.*]] = zext i1 [[TEST]] to i32
; CHECK-NEXT: switch i32 [[TEST_EXT]], label [[LOOP]] [
; CHECK-NEXT: i32 1, label [[LOOP]]
; CHECK-NEXT: i32 0, label [[EXIT:%.*]]
; CHECK-NEXT: ]
; CHECK: exit:
; CHECK-NEXT: ret float [[SUM]]
;
entry:
  %start = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
  br label %loop
loop:
  %sum.prev = phi float [ %sum, %loop ], [ %sum, %loop ], [ 0.0, %entry ]
  %ptr.prev = phi ptr addrspace(7) [ %ptr, %loop ], [ %start, %entry ], [ %ptr, %loop ]
  %i = phi i32 [ %i.next, %loop ], [ 0, %entry ], [ %i.next, %loop ]

  %val = load float, ptr addrspace(7) %ptr.prev
  %sum = fadd float %sum.prev, %val

  ;; Advance through the integer form of the pointer rather than a GEP.
  %ptr.prev.int = ptrtoint ptr addrspace(7) %ptr.prev to i160
  %ptr.int = add i160 %ptr.prev.int, 4
  %ptr = inttoptr i160 %ptr.int to ptr addrspace(7)
  %i.next = add i32 %i, 1
  %test = icmp ult i32 %i.next, %len
  ;; Widen the i1 so a switch can produce two edges back to %loop.
  %test.ext = zext i1 %test to i32
  switch i32 %test.ext, label %loop [
    i32 1, label %loop
    i32 0, label %exit
  ]
exit:
  ret float %sum
}