Scalarization can expose optimization opportunities for the individual elements of a vector, and can therefore be beneficial on targets like GPUs that tend to operate on scalars anyway. However, notably with 16-bit operations, it is often beneficial to keep <2 x i16 / half> vectors around, since there are packed instructions for those. Refactor the code to operate on "fragments" of split vectors. The fragments are usually scalars, but may themselves be smaller vectors when the scalarizer-min-bits option is used. If the split is uneven, the last fragment is a shorter remainder. This is almost NFC (no functional change) when the new option is unused, but it happens to clean up some code in the fully scalarized case as well. Differential Revision: https://reviews.llvm.org/D149842
28 lines
1.1 KiB
LLVM
28 lines
1.1 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
|
|
; RUN: opt %s -passes='scalarizer,dce' -S -scalarize-load-store -o - | FileCheck %s
|
|
|
|
; This used to crash because the same (pointer) value was scattered by
|
|
; different amounts.
|
|
|
|
; Regression test body: three vector stores of differing element type and
; element count (<2 x i32>, <4 x i32>, <2 x i16>) all go through the same
; pointer %p. The check lines below expect no vector store to remain: each
; one becomes one scalar store per element, with getelementptr instructions
; supplying the addresses of the elements past index 0.
define void @test1(ptr %p) {
|
|
; CHECK-LABEL: @test1(
|
|
; CHECK-NEXT: [[P_I11:%.*]] = getelementptr i16, ptr [[P:%.*]], i32 1
|
|
; CHECK-NEXT: [[P_I2:%.*]] = getelementptr i32, ptr [[P]], i32 2
|
|
; CHECK-NEXT: [[P_I3:%.*]] = getelementptr i32, ptr [[P]], i32 3
|
|
; CHECK-NEXT: store i32 0, ptr [[P]], align 8
|
|
; CHECK-NEXT: [[P_I1:%.*]] = getelementptr i32, ptr [[P]], i32 1
|
|
; CHECK-NEXT: store i32 0, ptr [[P_I1]], align 4
|
|
; CHECK-NEXT: store i32 0, ptr [[P]], align 16
|
|
; CHECK-NEXT: store i32 0, ptr [[P_I1]], align 4
|
|
; CHECK-NEXT: store i32 0, ptr [[P_I2]], align 8
|
|
; CHECK-NEXT: store i32 0, ptr [[P_I3]], align 4
|
|
; CHECK-NEXT: store i16 0, ptr [[P]], align 4
|
|
; CHECK-NEXT: store i16 0, ptr [[P_I11]], align 2
|
|
; CHECK-NEXT: ret void
|
|
;
|
|
; Two i32 elements: expected as scalar stores at i32 offsets 0 and 1.
 store <2 x i32> zeroinitializer, ptr %p
|
|
; Four i32 elements through the same pointer: expected as scalar stores at
; i32 offsets 0..3; the expected output reuses the offset-1 address computed
; for the previous store.
 store <4 x i32> zeroinitializer, ptr %p
|
|
; Two i16 elements through the same pointer: expected as scalar stores at
; i16 offsets 0 and 1, using a separate i16-typed getelementptr.
 store <2 x i16> zeroinitializer, ptr %p
|
|
 ret void
|
|
}
|