[libc] Efficiently implement aligned_alloc for AMDGPU (#146585)

Summary:
This patch uses the actual allocator interface to implement
`aligned_alloc`. We do this by simply rounding up the amount allocated.
Because of how the index calculation works, any pointer within an allocated
chunk still maps back to that chunk, so we can adjust the pointer internally
and freeing it works all the same.
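
As a minimal illustration of why the round-up trick composes with freeing (the names CHUNK_SIZE, slab_base, and chunk_index below are hypothetical, not the allocator's real internals), consider how an address is mapped back to its chunk:

#include <cstdint>

// Illustrative sketch only: assume a slab of identically sized chunks.
constexpr uint64_t CHUNK_SIZE = 64;

// Freeing maps an address back to a chunk by rounding its offset down.
uint64_t chunk_index(uint64_t slab_base, uint64_t addr) {
  return (addr - slab_base) / CHUNK_SIZE;
}

// Example: a request for size 16 with alignment 32 over-allocates
// 16 + 32 - 1 = 47 bytes, which still fits in one chunk. If that chunk spans
// [0x1050, 0x1090), __builtin_align_up(0x1050, 32) returns 0x1060, and both
// addresses round down to the same chunk index, so freeing the aligned
// pointer releases the chunk that was actually handed out.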
Joseph Huber
2025-07-02 09:25:57 -05:00
committed by GitHub
parent bca79ec0d2
commit 24828c8c45
5 changed files with 81 additions and 9 deletions

View File

@@ -138,6 +138,11 @@ void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
s[i] = c;
}
// Indicates that the provided value is a power of two.
static inline constexpr bool is_pow2(uint64_t x) {
return x && (x & (x - 1)) == 0;
}
} // namespace impl
/// A slab allocator used to hand out identically sized slabs of memory.
@@ -572,5 +577,27 @@ void *reallocate(void *ptr, uint64_t size) {
return new_ptr;
}
void *aligned_allocate(uint32_t alignment, uint64_t size) {
// All alignment values must be a non-zero power of two.
if (!impl::is_pow2(alignment))
return nullptr;
// If the requested alignment is less than what we already provide, this is
// just a normal allocation.
if (alignment < MIN_ALIGNMENT + 1)
return gpu::allocate(size);
// We can't handle alignments greater than 2MiB, so we simply fail.
if (alignment > SLAB_ALIGNMENT + 1)
return nullptr;
// Trying to handle the alignment internally would break the assumption that
// each chunk is identical to the others. Instead, allocate enough memory for
// worst-case alignment and round the pointer up; the index logic will round
// it back down properly when freeing.
uint64_t rounded = size + alignment - 1;
void *ptr = gpu::allocate(rounded);
return __builtin_align_up(ptr, alignment);
}
} // namespace gpu
} // namespace LIBC_NAMESPACE_DECL

View File

@@ -18,6 +18,7 @@ namespace gpu {
void *allocate(uint64_t size);
void deallocate(void *ptr);
void *reallocate(void *ptr, uint64_t size);
void *aligned_allocate(uint32_t alignment, uint64_t size);
} // namespace gpu
} // namespace LIBC_NAMESPACE_DECL

View File

@@ -15,15 +15,15 @@
namespace LIBC_NAMESPACE_DECL {
LLVM_LIBC_FUNCTION(void *, aligned_alloc, (size_t alignment, size_t size)) {
-if ((alignment & -alignment) != alignment)
-return nullptr;
-void *ptr = gpu::allocate(size);
-if ((reinterpret_cast<uintptr_t>(ptr) & (alignment - 1)) != 0) {
-gpu::deallocate(ptr);
-return nullptr;
-}
-return ptr;
+// FIXME: NVIDIA targets currently use the built-in 'malloc' which we cannot
+// reason with. But we still need to provide this function for compatibility.
+#ifndef LIBC_TARGET_ARCH_IS_NVPTX
+return gpu::aligned_allocate(static_cast<uint32_t>(alignment), size);
+#else
+(void)alignment;
+(void)size;
+return nullptr;
+#endif
}
} // namespace LIBC_NAMESPACE_DECL
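
For reference, a hedged caller-side sketch of how device code might exercise this entrypoint (the helper name and sizes are made up for illustration); per the FIXME above, NVPTX builds currently get nullptr, so the failure path matters:

#include <stdlib.h>

// Hypothetical device-side helper: grab a 64-byte-aligned scratch buffer,
// use it, and release it.
static void use_aligned_scratch() {
  void *buf = aligned_alloc(/*alignment=*/64, /*size=*/128);
  if (!buf)
    return; // NVPTX always fails today; AMDGPU can still run out of memory.
  // ... use the 64-byte-aligned buffer ...
  free(buf);
}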

View File

@@ -32,6 +32,21 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
--blocks 1024
)
add_integration_test(
aligned_alloc
SUITE
stdlib-gpu-integration-tests
SRCS
aligned_alloc.cpp
DEPENDS
libc.src.stdlib.aligned_alloc
libc.src.stdlib.malloc
libc.src.stdlib.free
LOADER_ARGS
--threads 256
--blocks 128
)
add_integration_test(
malloc_stress
SUITE

View File

@@ -0,0 +1,29 @@
#include "test/IntegrationTest/test.h"
#include "src/__support/GPU/utils.h"
#include "src/stdlib/aligned_alloc.h" // Adjust path if needed
#include "src/stdlib/free.h"
using namespace LIBC_NAMESPACE;
TEST_MAIN(int, char **, char **) {
// aligned_alloc with valid alignment and size
void *ptr = LIBC_NAMESPACE::aligned_alloc(32, 16);
EXPECT_NE(ptr, nullptr);
EXPECT_TRUE(__builtin_is_aligned(ptr, 32));
LIBC_NAMESPACE::free(ptr);
// aligned_alloc fails if alignment is not power of two
void *bad_align = LIBC_NAMESPACE::aligned_alloc(30, 99);
EXPECT_EQ(bad_align, nullptr);
// aligned_alloc with a divergent size.
size_t alignment = 1 << (__gpu_lane_id() % 8 + 1);
void *div =
LIBC_NAMESPACE::aligned_alloc(alignment, (gpu::get_thread_id() + 1) * 4);
EXPECT_NE(div, nullptr);
EXPECT_TRUE(__builtin_is_aligned(div, alignment));
LIBC_NAMESPACE::free(div);
return 0;
}