Files
clang-p2996/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
Guray Ozen ea84897ba3 [mlir][gpu] Introduce gpu.dynamic_shared_memory Op (#71546)
While the `gpu.launch` Op allows setting the size via the
`dynamic_shared_memory_size` argument, accessing the dynamic shared
memory is very convoluted. This PR implements the proposed Op,
`gpu.dynamic_shared_memory` that aims to simplify the utilization of
dynamic shared memory.

RFC:
https://discourse.llvm.org/t/rfc-simplifying-dynamic-shared-memory-access-in-gpu/

**Proposal from RFC**
This PR `gpu.dynamic.shared.memory` Op to use dynamic shared memory
feature efficiently. It is is a powerful feature that enables the
allocation of shared memory at runtime with the kernel launch on the
host. Afterwards, the memory can be accessed directly from the device. I
believe similar story exists for AMDGPU.

**Current way Using Dynamic Shared Memory with MLIR**

Let me illustrate the challenges of using dynamic shared memory in MLIR
with an example below. The process involves several steps:
- memref.global 0-sized array LLVM's NVPTX backend expects
- dynamic_shared_memory_size Set the size of dynamic shared memory
- memref.get_global Access the global symbol
- reinterpret_cast and subview Many OPs for pointer arithmetic

```
// Step 1. Create 0-sized global symbol. Manually set the alignment
memref.global "private" @dynamicShmem  : memref<0xf16, 3> { alignment = 16 }
func.func @main() {
  // Step 2. Allocate shared memory
  gpu.launch blocks(...) threads(...)
    dynamic_shared_memory_size %c10000 {
    // Step 3. Access the global object
    %shmem = memref.get_global @dynamicShmem : memref<0xf16, 3>
    // Step 4. A sequence of `memref.reinterpret_cast` and `memref.subview` operations.
    %4 = memref.reinterpret_cast %shmem to offset: [0], sizes: [14, 64, 128],  strides: [8192,128,1] : memref<0xf16, 3> to memref<14x64x128xf16,3>
    %5 = memref.subview %4[7, 0, 0][7, 64, 128][1,1,1] : memref<14x64x128xf16,3> to memref<7x64x128xf16, strided<[8192, 128, 1], offset: 57344>, 3>
    %6 = memref.subview %5[2, 0, 0][1, 64, 128][1,1,1] : memref<7x64x128xf16, strided<[8192, 128, 1], offset: 57344>, 3> to memref<64x128xf16, strided<[128, 1], offset: 73728>, 3>
    %7 = memref.subview %6[0, 0][64, 64][1,1]  : memref<64x128xf16, strided<[128, 1], offset: 73728>, 3> to memref<64x64xf16, strided<[128, 1], offset: 73728>, 3>
    %8 = memref.subview %6[32, 0][64, 64][1,1] : memref<64x128xf16, strided<[128, 1], offset: 73728>, 3> to memref<64x64xf16, strided<[128, 1], offset: 77824>, 3>
    // Step.5 Use
    "test.use.shared.memory"(%7) : (memref<64x64xf16, strided<[128, 1], offset: 73728>, 3>) -> (index)
    "test.use.shared.memory"(%8) : (memref<64x64xf16, strided<[128, 1], offset: 77824>, 3>) -> (index)
    gpu.terminator
  }
```

Let’s write the program above with that:

```
func.func @main() {
    gpu.launch blocks(...) threads(...) dynamic_shared_memory_size %c10000 {
    	%i = arith.constant 18 : index
        // Step 1: Obtain shared memory directly
        %shmem = gpu.dynamic_shared_memory : memref<?xi8, 3>
        %c147456 = arith.constant 147456 : index
        %c155648 = arith.constant 155648 : index
        %7 = memref.view %shmem[%c147456][] : memref<?xi8, 3> to memref<64x64xf16, 3>
        %8 = memref.view %shmem[%c155648][] : memref<?xi8, 3> to memref<64x64xf16, 3>

        // Step 2: Utilize the shared memory
        "test.use.shared.memory"(%7) : (memref<64x64xf16, 3>) -> (index)
        "test.use.shared.memory"(%8) : (memref<64x64xf16, 3>) -> (index)
    }
}
```

This PR resolves #72513
2023-11-16 14:42:17 +01:00

139 lines
5.4 KiB
C++

//===- GPUOpsLowering.h - GPU FuncOp / ReturnOp lowering -------*- C++ -*--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
#define MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
namespace mlir {
/// Lowering for gpu.dynamic.shared.memory to LLVM dialect. The pattern first
/// create a 0-sized global array symbol similar as LLVM expects. It constructs
/// a memref descriptor with these values and return it.
struct GPUDynamicSharedMemoryOpLowering
: public ConvertOpToLLVMPattern<gpu::DynamicSharedMemoryOp> {
using ConvertOpToLLVMPattern<
gpu::DynamicSharedMemoryOp>::ConvertOpToLLVMPattern;
GPUDynamicSharedMemoryOpLowering(const LLVMTypeConverter &converter,
unsigned alignmentBit = 0)
: ConvertOpToLLVMPattern<gpu::DynamicSharedMemoryOp>(converter),
alignmentBit(alignmentBit) {}
LogicalResult
matchAndRewrite(gpu::DynamicSharedMemoryOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override;
private:
// Alignment bit
unsigned alignmentBit;
};
struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
GPUFuncOpLowering(const LLVMTypeConverter &converter,
unsigned allocaAddrSpace, unsigned workgroupAddrSpace,
StringAttr kernelAttributeName)
: ConvertOpToLLVMPattern<gpu::GPUFuncOp>(converter),
allocaAddrSpace(allocaAddrSpace),
workgroupAddrSpace(workgroupAddrSpace),
kernelAttributeName(kernelAttributeName) {}
LogicalResult
matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override;
private:
/// The address space to use for `alloca`s in private memory.
unsigned allocaAddrSpace;
/// The address space to use declaring workgroup memory.
unsigned workgroupAddrSpace;
/// The attribute name to use instead of `gpu.kernel`.
StringAttr kernelAttributeName;
};
/// The lowering of gpu.printf to a call to HIP hostcalls
///
/// Simplifies llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp, as we don't have
/// to deal with %s (even if there were first-class strings in MLIR, they're not
/// legal input to gpu.printf) or non-constant format strings
struct GPUPrintfOpToHIPLowering : public ConvertOpToLLVMPattern<gpu::PrintfOp> {
using ConvertOpToLLVMPattern<gpu::PrintfOp>::ConvertOpToLLVMPattern;
LogicalResult
matchAndRewrite(gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override;
};
/// The lowering of gpu.printf to a call to an external printf() function
///
/// This pass will add a declaration of printf() to the GPUModule if needed
/// and seperate out the format strings into global constants. For some
/// runtimes, such as OpenCL on AMD, this is sufficient setup, as the compiler
/// will lower printf calls to appropriate device-side code
struct GPUPrintfOpToLLVMCallLowering
: public ConvertOpToLLVMPattern<gpu::PrintfOp> {
GPUPrintfOpToLLVMCallLowering(const LLVMTypeConverter &converter,
int addressSpace = 0)
: ConvertOpToLLVMPattern<gpu::PrintfOp>(converter),
addressSpace(addressSpace) {}
LogicalResult
matchAndRewrite(gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override;
private:
int addressSpace;
};
/// Lowering of gpu.printf to a vprintf standard library.
struct GPUPrintfOpToVPrintfLowering
: public ConvertOpToLLVMPattern<gpu::PrintfOp> {
using ConvertOpToLLVMPattern<gpu::PrintfOp>::ConvertOpToLLVMPattern;
LogicalResult
matchAndRewrite(gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override;
};
struct GPUReturnOpLowering : public ConvertOpToLLVMPattern<gpu::ReturnOp> {
using ConvertOpToLLVMPattern<gpu::ReturnOp>::ConvertOpToLLVMPattern;
LogicalResult
matchAndRewrite(gpu::ReturnOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, adaptor.getOperands());
return success();
}
};
namespace impl {
/// Unrolls op if it's operating on vectors.
LogicalResult scalarizeVectorOp(Operation *op, ValueRange operands,
ConversionPatternRewriter &rewriter,
const LLVMTypeConverter &converter);
} // namespace impl
/// Rewriting that unrolls SourceOp to scalars if it's operating on vectors.
template <typename SourceOp>
struct ScalarizeVectorOpLowering : public ConvertOpToLLVMPattern<SourceOp> {
public:
using ConvertOpToLLVMPattern<SourceOp>::ConvertOpToLLVMPattern;
LogicalResult
matchAndRewrite(SourceOp op, typename SourceOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
return impl::scalarizeVectorOp(op, adaptor.getOperands(), rewriter,
*this->getTypeConverter());
}
};
} // namespace mlir
#endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_