clang-p2996/llvm/test/CodeGen/X86/atomic-non-integer-fp128.ll
James Y Knight d6f9278ae9 [X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (#74275)
In late 2021, both Intel and AMD finally documented that every
AVX-capable CPU has always been guaranteed to execute aligned 16-byte
loads/stores atomically, and further, guaranteed that all future CPUs
with AVX will do so as well.

Therefore, we may use normal SSE 128-bit load/store instructions to
implement atomics, if AVX is enabled.

Per the AMD64 Architecture Programmer's Manual, 7.3.2 Access Atomicity:

> Processors that report [AVX] extend the atomicity for cacheable,
> naturally-aligned single loads or stores from a quadword to a double
> quadword.

Per Intel's SDM:

> Processors that enumerate support for Intel(R) AVX guarantee that the
> 16-byte memory operations performed by the following instructions will
> always be carried out atomically:
> - MOVAPD, MOVAPS, and MOVDQA.
> - VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
> - VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded with
>   EVEX.128 and k0 (masking disabled).

This was also confirmed to be true for Zhaoxin CPUs with AVX, in
https://gcc.gnu.org/PR104688
2024-05-16 18:24:23 -04:00
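
For illustration only (this sketch is not part of the commit or of the test file below; the function name is hypothetical), a minimal LLVM IR example of the kind of naturally aligned 16-byte atomic access this change affects. With -mattr=avx the memory access can now be performed by a single plain 128-bit vector load, whereas without AVX the backend still has to emulate it with a lock cmpxchg16b loop, as the X64-SSE CHECK lines in the test below show:

; Hypothetical sketch, not taken from the test file.
define i128 @load_i128_unordered(ptr %p) {
  ; A naturally aligned 16-byte atomic load: with AVX this may be
  ; carried out as one plain 128-bit vector load, per the vendor
  ; guarantees quoted above; without AVX the backend falls back to
  ; a lock cmpxchg16b loop to keep the access atomic.
  %v = load atomic i128, ptr %p unordered, align 16
  ret i128 %v
}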

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=sse2,cx16 | FileCheck %s --check-prefixes=X64-SSE
; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx,cx16 | FileCheck %s --check-prefixes=X64-AVX
; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx512f,cx16 | FileCheck %s --check-prefixes=X64-AVX
; Codegen of fp128 without cx16 is tested in atomic-nocx16.ll

define void @store_fp128(ptr %fptr, fp128 %v) {
; X64-SSE-LABEL: store_fp128:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pushq %rbx
; X64-SSE-NEXT: .cfi_def_cfa_offset 16
; X64-SSE-NEXT: .cfi_offset %rbx, -16
; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rbx
; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; X64-SSE-NEXT: movq (%rdi), %rax
; X64-SSE-NEXT: movq 8(%rdi), %rdx
; X64-SSE-NEXT: .p2align 4, 0x90
; X64-SSE-NEXT: .LBB0_1: # %atomicrmw.start
; X64-SSE-NEXT: # =>This Inner Loop Header: Depth=1
; X64-SSE-NEXT: lock cmpxchg16b (%rdi)
; X64-SSE-NEXT: jne .LBB0_1
; X64-SSE-NEXT: # %bb.2: # %atomicrmw.end
; X64-SSE-NEXT: popq %rbx
; X64-SSE-NEXT: .cfi_def_cfa_offset 8
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: store_fp128:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
; X64-AVX-NEXT: retq
  store atomic fp128 %v, ptr %fptr unordered, align 16
  ret void
}

define fp128 @load_fp128(ptr %fptr) {
; X64-SSE-LABEL: load_fp128:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pushq %rbx
; X64-SSE-NEXT: .cfi_def_cfa_offset 16
; X64-SSE-NEXT: .cfi_offset %rbx, -16
; X64-SSE-NEXT: xorl %eax, %eax
; X64-SSE-NEXT: xorl %edx, %edx
; X64-SSE-NEXT: xorl %ecx, %ecx
; X64-SSE-NEXT: xorl %ebx, %ebx
; X64-SSE-NEXT: lock cmpxchg16b (%rdi)
; X64-SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; X64-SSE-NEXT: popq %rbx
; X64-SSE-NEXT: .cfi_def_cfa_offset 8
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: load_fp128:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; X64-AVX-NEXT: retq
  %v = load atomic fp128, ptr %fptr unordered, align 16
  ret fp128 %v
}

define fp128 @exchange_fp128(ptr %fptr, fp128 %x) {
; X64-SSE-LABEL: exchange_fp128:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: pushq %rbx
; X64-SSE-NEXT: .cfi_def_cfa_offset 16
; X64-SSE-NEXT: .cfi_offset %rbx, -16
; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rbx
; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; X64-SSE-NEXT: movq (%rdi), %rax
; X64-SSE-NEXT: movq 8(%rdi), %rdx
; X64-SSE-NEXT: .p2align 4, 0x90
; X64-SSE-NEXT: .LBB2_1: # %atomicrmw.start
; X64-SSE-NEXT: # =>This Inner Loop Header: Depth=1
; X64-SSE-NEXT: lock cmpxchg16b (%rdi)
; X64-SSE-NEXT: jne .LBB2_1
; X64-SSE-NEXT: # %bb.2: # %atomicrmw.end
; X64-SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; X64-SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
; X64-SSE-NEXT: popq %rbx
; X64-SSE-NEXT: .cfi_def_cfa_offset 8
; X64-SSE-NEXT: retq
;
; X64-AVX-LABEL: exchange_fp128:
; X64-AVX: # %bb.0:
; X64-AVX-NEXT: pushq %rbx
; X64-AVX-NEXT: .cfi_def_cfa_offset 16
; X64-AVX-NEXT: .cfi_offset %rbx, -16
; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rbx
; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
; X64-AVX-NEXT: movq (%rdi), %rax
; X64-AVX-NEXT: movq 8(%rdi), %rdx
; X64-AVX-NEXT: .p2align 4, 0x90
; X64-AVX-NEXT: .LBB2_1: # %atomicrmw.start
; X64-AVX-NEXT: # =>This Inner Loop Header: Depth=1
; X64-AVX-NEXT: lock cmpxchg16b (%rdi)
; X64-AVX-NEXT: jne .LBB2_1
; X64-AVX-NEXT: # %bb.2: # %atomicrmw.end
; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; X64-AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
; X64-AVX-NEXT: popq %rbx
; X64-AVX-NEXT: .cfi_def_cfa_offset 8
; X64-AVX-NEXT: retq
  %v = atomicrmw xchg ptr %fptr, fp128 %x monotonic, align 16
  ret fp128 %v
}
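
The CHECK lines in this file are autogenerated (see the NOTE at the top); if the lowering changes again, they can be refreshed with utils/update_llc_test_checks.py rather than edited by hand. A typical invocation, assuming an LLVM checkout with llc built under build/bin, might look like:

llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
    llvm/test/CodeGen/X86/atomic-non-integer-fp128.ll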