Files
clang-p2996/llvm/test/CodeGen/X86/AMX/amx-configO2toO0.ll
Luo, Yuanke 496156ac57 [X86][AMX] Multiple configure for AMX register.
The previous solution depends on variable name to record the shape
information. However it is not reliable, because in release build
compiler would not set the variable name. It can be accomplished with an
additional option `fno-discard-value-names`, but it is not acceptable
for users.
This patch is to preconfigure the tile register with machine
instruction. It follows the same approach that single configure does. In
the future we can fall back to multiple configure when single configure
fails due to the shape dependency issue.
The algorithm to configure the tile register is simple in the patch. We
may improve it in the future. It configures tile registers based on basic
blocks. The compiler would spill a tile register if it lives out of the
basic block. After the configuration there should be no spill across tile
configure in the register allocation. Just like fast register allocation
the algorithm walks the instructions in reverse order. When the shape
dependency isn't met, it inserts ldtilecfg after the last instruction
that defines the shape.
In post configuration the compiler also walks the basic block to collect
the physical tile register numbers and generates instructions to fill the
stack slot with the corresponding shape information.
TODO: There is some follow-up work in D125602. The risk is that modifying
the fast RA may cause regressions, as fast RA is used for different
targets. We may create an independent RA for tile registers.

Differential Revision: https://reviews.llvm.org/D125075
2022-05-24 13:18:42 +08:00

185 lines
8.8 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
; Two 1024-byte global buffers used as the memory operands for the AMX tile
; loads/stores exercised below.
@buf = dso_local global [1024 x i8] zeroinitializer, align 16
@buf2 = dso_local global [1024 x i8] zeroinitializer, align 16
; test_api: regression test for the multiple-ldtilecfg configuration scheme
; at -O0.  Each branch performs three tile loads with different shape pairs
; (%row x 8, 8 x %col, %row x %col), so a single up-front tile configure
; cannot satisfy all shape dependencies; the CHECK lines below verify that
; llc emits an ldtilecfg per shape group (note the multiple ldtilecfg
; occurrences in each basic block).
; NOTE: the AVX512 CHECK block is autogenerated by update_llc_test_checks.py;
; do not hand-edit it — regenerate instead.
define dso_local void @test_api(i32 %cond, i16 signext %row, i16 signext %col) nounwind {
; AVX512-LABEL: test_api:
; AVX512: # %bb.0: # %entry
; AVX512-NEXT: pushq %rbp
; AVX512-NEXT: movq %rsp, %rbp
; AVX512-NEXT: andq $-1024, %rsp # imm = 0xFC00
; AVX512-NEXT: subq $8192, %rsp # imm = 0x2000
; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movb $1, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %dx, %ax
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: movw %si, %ax
; AVX512-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: leaq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: cmpl $0, %edi
; AVX512-NEXT: je .LBB0_2
; AVX512-NEXT: # %bb.1: # %if.then
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
; AVX512-NEXT: movl $buf, %r9d
; AVX512-NEXT: movl $32, %r10d
; AVX512-NEXT: movw $8, %si
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0
; AVX512-NEXT: movl $64, %r9d
; AVX512-NEXT: movw $8, %si
; AVX512-NEXT: tilestored %tmm0, (%r8,%r9)
; AVX512-NEXT: movl $buf, %r8d
; AVX512-NEXT: movl $32, %r9d
; AVX512-NEXT: movw $8, %si
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: # implicit-def: $sil
; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: tileloadd (%r8,%r9), %tmm0
; AVX512-NEXT: movl $64, %r8d
; AVX512-NEXT: movw $8, %si
; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8)
; AVX512-NEXT: movl $buf, %esi
; AVX512-NEXT: movl $32, %edi
; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: jmp .LBB0_3
; AVX512-NEXT: .LBB0_2: # %if.else
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
; AVX512-NEXT: movl $buf2, %r9d
; AVX512-NEXT: movl $32, %r10d
; AVX512-NEXT: movw $8, %si
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %si, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: tileloadd (%r9,%r10), %tmm0
; AVX512-NEXT: movl $64, %r9d
; AVX512-NEXT: movw $8, %si
; AVX512-NEXT: tilestored %tmm0, (%r8,%r9)
; AVX512-NEXT: movl $buf2, %r8d
; AVX512-NEXT: movl $32, %r9d
; AVX512-NEXT: movw $8, %si
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: # implicit-def: $sil
; AVX512-NEXT: movb %sil, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: tileloadd (%r8,%r9), %tmm0
; AVX512-NEXT: movl $64, %r8d
; AVX512-NEXT: movw $8, %si
; AVX512-NEXT: tilestored %tmm0, (%rdi,%r8)
; AVX512-NEXT: movl $buf2, %esi
; AVX512-NEXT: movl $32, %edi
; AVX512-NEXT: tileloadd (%rsi,%rdi), %tmm0
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: .LBB0_3: # %if.end
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %ax # 2-byte Reload
; AVX512-NEXT: movw {{[-0-9]+}}(%r{{[sb]}}p), %cx # 2-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
; AVX512-NEXT: movl $64, %r10d
; AVX512-NEXT: movw $8, %di
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: tileloadd (%r8,%r10), %tmm0
; AVX512-NEXT: movabsq $64, %r8
; AVX512-NEXT: tilestored %tmm0, 1024(%rsp,%r8) # 1024-byte Folded Spill
; AVX512-NEXT: movl $64, %r10d
; AVX512-NEXT: movw $8, %r8w
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %di, {{[0-9]+}}(%rsp)
; AVX512-NEXT: # implicit-def: $al
; AVX512-NEXT: movb %al, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: # implicit-def: $r8b
; AVX512-NEXT: movb %r8b, {{[0-9]+}}(%rsp)
; AVX512-NEXT: movw %cx, {{[0-9]+}}(%rsp)
; AVX512-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
; AVX512-NEXT: tileloadd (%r9,%r10), %tmm2
; AVX512-NEXT: movl $64, %r8d
; AVX512-NEXT: tileloadd (%rsi,%r8), %tmm0
; AVX512-NEXT: movw $8, %si
; AVX512-NEXT: movabsq $64, %r8
; AVX512-NEXT: tileloadd 1024(%rsp,%r8), %tmm1 # 1024-byte Folded Reload
; AVX512-NEXT: tdpbssd %tmm2, %tmm1, %tmm0
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: movl $64, %esi
; AVX512-NEXT: tileloadd (%rdx,%rsi), %tmm0
; AVX512-NEXT: movl $buf, %edx
; AVX512-NEXT: movl $32, %esi
; AVX512-NEXT: tilestored %tmm0, (%rdx,%rsi)
; AVX512-NEXT: movq %rbp, %rsp
; AVX512-NEXT: popq %rbp
; AVX512-NEXT: tilerelease
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
entry:
%tobool.not = icmp eq i32 %cond, 0
br i1 %tobool.not, label %if.else, label %if.then
; Three tile loads from @buf with shapes %row x 8, 8 x %col and
; %row x %col — three distinct shape configurations in one block.
if.then: ; preds = %entry
%0 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
%1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
%2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32)
br label %if.end
; Same three shapes, loading from @buf2 instead of @buf.
if.else: ; preds = %entry
%3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 8, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
%4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
%5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf2, i64 0, i64 0), i64 32)
br label %if.end
; Merge the tiles from both branches (forces them live across block
; boundaries, i.e. spilled/reloaded around the configures), then compute
; C += A * B with tdpbssd and store the result back to @buf.
if.end: ; preds = %if.else, %if.then
%a.sroa.1094.0.in = phi x86_amx [ %3, %if.else ], [ %0, %if.then ]
%b.sroa.1069.0.in = phi x86_amx [ %4, %if.else ], [ %1, %if.then ]
%c.sroa.1044.0.in = phi x86_amx [ %5, %if.else ], [ %2, %if.then ]
%6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %row, i16 %col, i16 8, x86_amx %c.sroa.1044.0.in, x86_amx %a.sroa.1094.0.in, x86_amx %b.sroa.1069.0.in)
tail call void @llvm.x86.tilestored64.internal(i16 %row, i16 %col, i8* getelementptr inbounds ([1024 x i8], [1024 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6)
ret void
}
; AMX intrinsic declarations: tile load (row, col, base ptr, stride),
; int8 tile dot-product accumulate, and tile store.
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)