I noticed that NVPTX will sometimes emit `mad.lo` to multiply by 1, e.g. in https://gcc.godbolt.org/z/4j47Y9W4c. This happens when DAGCombiner operates on the add before the mul, so the imad contraction happens regardless of whether the mul could have been simplified. To fix this, I remove `NVPTXISD::IMAD` and only combine to mad during selection. This allows the default DAGCombiner patterns to simplify the graph without any NVPTX-specific intervention.
43 lines
2.1 KiB
LLVM
43 lines
2.1 KiB
LLVM
; RUN: not llc < %s -mtriple=nvptx -mattr=+ptx72 -mcpu=sm_52 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
|
|
; RUN: not llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_50 2>&1 | FileCheck %s --check-prefixes=CHECK-FAILS
|
|
|
|
; RUN: llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-32
|
|
; RUN: llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | FileCheck %s --check-prefixes=CHECK,CHECK-64
|
|
; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -mtriple=nvptx -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
|
|
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mattr=+ptx73 -mcpu=sm_52 | %ptxas-verify %}
|
|
|
|
; CHECK-FAILS: in function test_dynamic_stackalloc{{.*}}: Support for dynamic alloca introduced in PTX ISA version 7.3 and requires target sm_52.
|
|
|
|
; CHECK-LABEL: .visible .func (.param .b32 func_retval0) test_dynamic_stackalloc(
|
|
; CHECK-NOT: __local_depot
|
|
|
|
; CHECK-32: ld.param.u32 %r[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
|
|
; CHECK-32-NEXT: add.s32 %r[[SIZE2:[0-9]]], %r[[SIZE]], 7;
|
|
; CHECK-32-NEXT: and.b32 %r[[SIZE3:[0-9]]], %r[[SIZE2]], -8;
|
|
; CHECK-32-NEXT: alloca.u32 %r[[ALLOCA:[0-9]]], %r[[SIZE3]], 16;
|
|
; CHECK-32-NEXT: cvta.local.u32 %r[[ALLOCA]], %r[[ALLOCA]];
|
|
; CHECK-32-NEXT: { // callseq 0, 0
|
|
; CHECK-32-NEXT: .param .b32 param0;
|
|
; CHECK-32-NEXT: st.param.b32 [param0], %r[[ALLOCA]];
|
|
|
|
; CHECK-64: ld.param.u64 %rd[[SIZE:[0-9]]], [test_dynamic_stackalloc_param_0];
|
|
; CHECK-64-NEXT: add.s64 %rd[[SIZE2:[0-9]]], %rd[[SIZE]], 7;
|
|
; CHECK-64-NEXT: and.b64 %rd[[SIZE3:[0-9]]], %rd[[SIZE2]], -8;
|
|
; CHECK-64-NEXT: alloca.u64 %rd[[ALLOCA:[0-9]]], %rd[[SIZE3]], 16;
|
|
; CHECK-64-NEXT: cvta.local.u64 %rd[[ALLOCA]], %rd[[ALLOCA]];
|
|
; CHECK-64-NEXT: { // callseq 0, 0
|
|
; CHECK-64-NEXT: .param .b64 param0;
|
|
; CHECK-64-NEXT: st.param.b64 [param0], %rd[[ALLOCA]];
|
|
|
|
; CHECK-NEXT: .param .b32 retval0;
|
|
; CHECK-NEXT: call.uni (retval0),
|
|
; CHECK-NEXT: bar,
|
|
|
|
; Test input function: reserves a stack buffer whose size is the runtime
; argument %n, passes that buffer to @bar, and returns @bar's i32 result.
define i32 @test_dynamic_stackalloc(i64 %n) {
|
|
; Variable-sized alloca: %n elements of i8, requested with 16-byte alignment.
%alloca = alloca i8, i64 %n, align 16
|
|
; The buffer pointer is the only argument passed to @bar.
%call = call i32 @bar(ptr %alloca)
|
|
ret i32 %call
|
|
}
|
|
|
|
; Callee used by test_dynamic_stackalloc; declared here without a body.
declare i32 @bar(ptr)
|