This is a follow-up to D99010. We didn't consider the live range of the shape registers when hoisting ldtilecfg, so there was a risk of inserting it into an invalid range of some register and getting unexpected errors. This patch fixes the problem by storing each shape value to the corresponding stack slot of ldtilecfg immediately after its definition. It also fixes a problem in the previous code: if there is no ldtilecfg that dominates all AMX instructions, we cannot initialize the shapes for the other ldtilecfg instructions. Some optimization opportunities are left for later, e.g. eliminating unused mov instructions and breaking the def-use dependency before RA.

Reviewed By: LuoYuanke, xiangzhangllvm

Differential Revision: https://reviews.llvm.org/D99966
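For reference, the placement the tests below check for follows this shape (a minimal sketch; the instruction names and the elided stack offsets are taken from the CHECK lines, with the row shape arriving in %di and the column shape in %si as in these tests):

    movw %si,  -{{[0-9]+}}(%rsp)   # store the column shape to the tile-config slot right after its definition
    movb %dil, -{{[0-9]+}}(%rsp)   # store the row shape likewise
    ldtilecfg  -{{[0-9]+}}(%rsp)   # reload the configuration at a point dominating the AMX instructions
    tilezero   %tmm0               # AMX instructions then use the configured shapes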
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s

@buf = dso_local global [3072 x i8] zeroinitializer, align 16

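; test1: both shapes come straight from the arguments, so a single ldtilecfg is
; hoisted into the entry block; the shape values are stored to the configuration
; stack slot right before it, ahead of all tileloadd/tdpbssd instructions.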
define dso_local void @test1(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    movw $8, %dx
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm0
; CHECK-NEXT:    movl $buf+1024, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm1
; CHECK-NEXT:    movl $buf+2048, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm2
; CHECK-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
; CHECK-NEXT:    tilestored %tmm2, (%rax,%rcx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    jmp foo # TAILCALL
  %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
  %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
  %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
  %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
  call void @llvm.dbg.value(metadata x86_amx %6, metadata !DILocalVariable(name: "1", scope: !2), metadata !DIExpression()), !dbg !3
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
  tail call void @foo()
  ret void
}

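; test2: the configuration area is filled in the entry block, but ldtilecfg is
; emitted after the call to foo so that one configuration covers the AMX
; instructions in both the if.true and if.false branches.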
define dso_local void @test2(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    subq $72, %rsp
; CHECK-NEXT:    movl %esi, %ebx
; CHECK-NEXT:    movl %edi, %ebp
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB1_3
; CHECK-NEXT:  # %bb.1: # %if.true
; CHECK-NEXT:    movw $8, %ax
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    movl $buf+1024, %edx
; CHECK-NEXT:    tileloadd (%rdx,%rcx), %tmm1
; CHECK-NEXT:    movl $buf+2048, %edx
; CHECK-NEXT:    tileloadd (%rdx,%rcx), %tmm2
; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%rdx,%rcx)
; CHECK-NEXT:    jmp .LBB1_2
; CHECK-NEXT:  .LBB1_3: # %if.false
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    movw $8, %dx
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm3
; CHECK-NEXT:    movl $buf+1024, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm4
; CHECK-NEXT:    movl $buf+2048, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm2
; CHECK-NEXT:    tdpbssd %tmm2, %tmm4, %tmm3
; CHECK-NEXT:    tilestored %tmm3, (%rax,%rcx)
; CHECK-NEXT:  .LBB1_2: # %if.true
; CHECK-NEXT:    addq $72, %rsp
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    retq
  call void @foo()
  br i1 undef, label %if.true, label %if.false

if.true:
  %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8)
  %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
  %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
  %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t4)
  br label %exit

if.false:
  %t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
  %t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
  %t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
  %t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t8)
  br label %exit

exit:
  ret void
}

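; test3: the row shape is a phi, so its store and the ldtilecfg are sunk into
; the exit block right before tilezero; the column shape from %si is still
; stored in the entry block.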
define dso_local void @test3(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB2_2
; CHECK-NEXT:  # %bb.1: # %if.true
; CHECK-NEXT:    incl %edi
; CHECK-NEXT:    jmp .LBB2_3
; CHECK-NEXT:  .LBB2_2: # %if.false
; CHECK-NEXT:    decl %edi
; CHECK-NEXT:  .LBB2_3: # %exit
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    tilestored %tmm0, (%rax,%rcx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  br i1 undef, label %if.true, label %if.false

if.true:
  %3 = add i16 %0, 1
  br label %exit

if.false:
  %4 = sub i16 %0, 1
  br label %exit

exit:
  %5 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
  %6 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %5, i16 %1)
  tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6)
  ret void
}

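; test4: the row shape is defined separately in if.true and if.false, so each
; branch stores its own value and runs its own ldtilecfg before reaching the
; AMX blocks amx1 and amx2.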
define dso_local void @test4(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB3_3
; CHECK-NEXT:  # %bb.1: # %if.true
; CHECK-NEXT:    incl %edi
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB3_4
; CHECK-NEXT:  .LBB3_2: # %amx2
; CHECK-NEXT:    movl $32, %eax
; CHECK-NEXT:    movl $buf+1024, %ecx
; CHECK-NEXT:    tileloadd (%rcx,%rax), %tmm0
; CHECK-NEXT:    movl $buf, %ecx
; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rax)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
; CHECK-NEXT:  .LBB3_3: # %if.false
; CHECK-NEXT:    decl %edi
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB3_2
; CHECK-NEXT:  .LBB3_4: # %amx1
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    tilestored %tmm0, (%rax,%rcx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  br i1 undef, label %if.true, label %if.false

if.true:
  %3 = add i16 %0, 1
  br i1 undef, label %amx1, label %amx2

if.false:
  %4 = sub i16 %0, 1
  br i1 undef, label %amx2, label %amx1

amx1:
  %5 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
  %6 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %5, i16 %1)
  tail call void @llvm.x86.tilestored64.internal(i16 %5, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %6)
  br label %exit

amx2:
  %7 = phi i16 [ %3, %if.true ], [ %4, %if.false ]
  %8 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %7, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
  tail call void @llvm.x86.tilestored64.internal(i16 %7, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %8)
  br label %exit

exit:
  ret void
}

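; test5: the column shape is updated inside the loop (in if.false), so the
; updated value is stored to the config slot there and ldtilecfg is reloaded
; in the loop header before the tilezero in if.true.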
define dso_local void @test5(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test5:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    movl $buf, %r8d
; CHECK-NEXT:    movl $32, %edx
; CHECK-NEXT:    leal -1(%rsi), %ecx
; CHECK-NEXT:    jmp .LBB4_1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB4_3: # %if.false
; CHECK-NEXT:    # in Loop: Header=BB4_1 Depth=1
; CHECK-NEXT:    movl %ecx, %esi
; CHECK-NEXT:    movw %cx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    cmpw $7, %si
; CHECK-NEXT:    jne .LBB4_5
; CHECK-NEXT:  .LBB4_1: # %loop.bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB4_3
; CHECK-NEXT:  # %bb.2: # %if.true
; CHECK-NEXT:    # in Loop: Header=BB4_1 Depth=1
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%r8,%rdx)
; CHECK-NEXT:    cmpw $7, %si
; CHECK-NEXT:    je .LBB4_1
; CHECK-NEXT:  .LBB4_5: # %exit
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %loop.bb1

loop.bb1:
  %2 = phi i16 [ %1, %entry ], [ %5, %loop.bb2 ]
  br i1 undef, label %if.true, label %if.false

if.true:
  %3 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 %2)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %2, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %3)
  br label %loop.bb2

if.false:
  %4 = sub i16 %1, 1
  br label %loop.bb2

loop.bb2:
  %5 = phi i16 [ %2, %if.true ], [ %4, %if.false ]
  %6 = icmp eq i16 %5, 7
  br i1 %6, label %loop.bb1, label %exit

exit:
  ret void
}

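; test6: the column shape is computed in loop.bb2 itself, so the store of %ax
; and the ldtilecfg sit in loop.bb2 immediately before tilezero on every
; iteration.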
define dso_local void @test6(i16 signext %0) nounwind {
; CHECK-LABEL: test6:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %r8d, %r8d
; CHECK-NEXT:    movl $buf, %ecx
; CHECK-NEXT:    movl $32, %edx
; CHECK-NEXT:    xorl %esi, %esi
; CHECK-NEXT:    jmp .LBB5_1
; CHECK-NEXT:    .p2align 4, 0x90
; CHECK-NEXT:  .LBB5_3: # %if.false
; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT:    decl %esi
; CHECK-NEXT:  .LBB5_4: # %loop.bb2
; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT:    leal (%rdi,%rsi), %eax
; CHECK-NEXT:    movw %ax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    cmpw $7, %si
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%rcx,%rdx)
; CHECK-NEXT:    jne .LBB5_5
; CHECK-NEXT:  .LBB5_1: # %loop.bb1
; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    testb %r8b, %r8b
; CHECK-NEXT:    jne .LBB5_3
; CHECK-NEXT:  # %bb.2: # %if.true
; CHECK-NEXT:    # in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT:    incl %esi
; CHECK-NEXT:    jmp .LBB5_4
; CHECK-NEXT:  .LBB5_5: # %exit
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
entry:
  br label %loop.bb1

loop.bb1:
  %1 = phi i16 [ 0, %entry ], [ %4, %loop.bb2 ]
  br i1 undef, label %if.true, label %if.false

if.true:
  %2 = add i16 %1, 1
  br label %loop.bb2

if.false:
  %3 = sub i16 %1, 1
  br label %loop.bb2

loop.bb2:
  %4 = phi i16 [ %2, %if.true ], [ %3, %if.false ]
  %5 = icmp eq i16 %4, 7
  %6 = add i16 %0, %4
  %7 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 %6)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %6, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32, x86_amx %7)
  br i1 %5, label %loop.bb1, label %exit

exit:
  ret void
}

declare dso_local void @foo() nounwind
declare void @llvm.dbg.value(metadata, metadata, metadata)
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!1}

!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !DIFile(filename: "1", directory: "1"))
!1 = !{i32 2, !"Debug Info Version", i32 3}
!2 = distinct !DISubprogram(unit: !0)
!3 = !DILocation(line: 1, column: 1, scope: !2)