From 45daa4fdc68f5faa5bd5c33da052d2415cd88540 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval@gmail.com>
Date: Fri, 17 May 2024 09:37:53 -0700
Subject: [PATCH] [flang][cuda] Move CUDA Fortran operations to a CUF dialect
 (#92317)

The number of operations dedicated to CUF grew and where all still in
FIR. In order to have a better organization, the CUF operations,
attributes and code is moved into their specific dialect and files. CUF
dialect is tightly coupled with HLFIR/FIR and their types.

The CUF attributes are bundled into their own library since some
HLFIR/FIR operations depend on them and the CUF dialect depends on the
FIR types. Without having the attributes into a separate library there
would be a dependency cycle.
---
 flang/include/flang/Lower/ConvertVariable.h   |  10 +-
 .../flang/Optimizer/Builder/FIRBuilder.h      |   4 +-
 .../flang/Optimizer/Builder/HLFIRTools.h      |   2 +-
 .../flang/Optimizer/Dialect/CMakeLists.txt    |   2 +
 .../Dialect/CUF/Attributes/CMakeLists.txt     |   7 +
 .../Dialect/CUF/Attributes/CUFAttr.h          | 106 +++++++
 .../Dialect/CUF/Attributes/CUFAttr.td         | 100 +++++++
 .../Optimizer/Dialect/CUF/CMakeLists.txt      |  11 +
 .../flang/Optimizer/Dialect/CUF/CUFDialect.h  |  26 ++
 .../flang/Optimizer/Dialect/CUF/CUFDialect.td |  43 +++
 .../flang/Optimizer/Dialect/CUF/CUFOps.h      |  20 ++
 .../flang/Optimizer/Dialect/CUF/CUFOps.td     | 263 ++++++++++++++++++
 .../flang/Optimizer/Dialect/FIRAttr.td        |  83 ------
 .../include/flang/Optimizer/Dialect/FIROps.h  |   1 +
 .../include/flang/Optimizer/Dialect/FIROps.td | 242 +---------------
 .../flang/Optimizer/Dialect/FIROpsSupport.h   |  13 -
 .../include/flang/Optimizer/HLFIR/HLFIROps.td |   5 +-
 .../include/flang/Optimizer/Support/InitFIR.h |   3 +-
 flang/include/flang/Optimizer/Support/Utils.h |  61 +---
 flang/lib/Frontend/CMakeLists.txt             |   2 +
 flang/lib/Lower/Allocatable.cpp               |  17 +-
 flang/lib/Lower/Bridge.cpp                    |  52 ++--
 flang/lib/Lower/CMakeLists.txt                |   4 +
 flang/lib/Lower/CallInterface.cpp             |  21 +-
 flang/lib/Lower/ConvertCall.cpp               |   3 +-
 flang/lib/Lower/ConvertVariable.cpp           |  57 ++--
 flang/lib/Optimizer/Builder/FIRBuilder.cpp    |   8 +-
 flang/lib/Optimizer/Builder/HLFIRTools.cpp    |   4 +-
 flang/lib/Optimizer/Dialect/CMakeLists.txt    |   3 +
 .../Dialect/CUF/Attributes/CMakeLists.txt     |  16 ++
 .../Dialect/CUF/Attributes/CUFAttr.cpp        |  32 +++
 .../lib/Optimizer/Dialect/CUF/CMakeLists.txt  |  22 ++
 .../lib/Optimizer/Dialect/CUF/CUFDialect.cpp  |  25 ++
 flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp    | 219 +++++++++++++++
 flang/lib/Optimizer/Dialect/FIRAttr.cpp       |   4 +-
 flang/lib/Optimizer/Dialect/FIRDialect.cpp    |   1 +
 flang/lib/Optimizer/Dialect/FIROps.cpp        | 163 -----------
 flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt   |   2 +
 flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp     |   4 +-
 .../Optimizer/HLFIR/Transforms/CMakeLists.txt |   2 +
 .../HLFIR/Transforms/ConvertToFIR.cpp         |   8 +-
 flang/test/Fir/cuf-invalid.fir                |  50 ++--
 flang/test/Fir/cuf.mlir                       |  46 +--
 flang/test/Lower/CUDA/cuda-allocatable.cuf    |  48 ++--
 flang/test/Lower/CUDA/cuda-data-attribute.cuf |  60 ++--
 flang/test/Lower/CUDA/cuda-data-transfer.cuf  |  50 ++--
 flang/test/Lower/CUDA/cuda-kernel-calls.cuf   |  12 +-
 .../Lower/CUDA/cuda-kernel-loop-directive.cuf |  10 +-
 flang/test/Lower/CUDA/cuda-mod.cuf            |   4 +-
 flang/test/Lower/CUDA/cuda-module-use.cuf     |   8 +-
 flang/test/Lower/CUDA/cuda-proc-attribute.cuf |  26 +-
 flang/tools/bbc/CMakeLists.txt                |   2 +
 flang/tools/fir-opt/CMakeLists.txt            |   2 +
 flang/tools/tco/CMakeLists.txt                |   2 +
 flang/unittests/Optimizer/CMakeLists.txt      |   2 +
 .../Optimizer/FortranVariableTest.cpp         |   8 +-
 56 files changed, 1184 insertions(+), 817 deletions(-)
 create mode 100644 flang/include/flang/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt
 create mode 100644 flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h
 create mode 100644 flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td
 create mode 100644 flang/include/flang/Optimizer/Dialect/CUF/CMakeLists.txt
 create mode 100644 flang/include/flang/Optimizer/Dialect/CUF/CUFDialect.h
 create mode 100644 flang/include/flang/Optimizer/Dialect/CUF/CUFDialect.td
 create mode 100644 flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h
 create mode 100644 flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
 create mode 100644 flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt
 create mode 100644 flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
 create mode 100644 flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt
 create mode 100644 flang/lib/Optimizer/Dialect/CUF/CUFDialect.cpp
 create mode 100644 flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp

diff --git a/flang/include/flang/Lower/ConvertVariable.h b/flang/include/flang/Lower/ConvertVariable.h
index d70d3268acac..515f4695951b 100644
--- a/flang/include/flang/Lower/ConvertVariable.h
+++ b/flang/include/flang/Lower/ConvertVariable.h
@@ -23,6 +23,10 @@
 #include "mlir/IR/Value.h"
 #include "llvm/ADT/DenseMap.h"
 
+namespace cuf {
+class DataAttributeAttr;
+}
+
 namespace fir {
 class ExtendedValue;
 class FirOpBuilder;
@@ -146,9 +150,9 @@ translateSymbolAttributes(mlir::MLIRContext *mlirContext,
 
 /// Translate the CUDA Fortran attributes of \p sym into the FIR CUDA attribute
 /// representation.
-fir::CUDADataAttributeAttr
-translateSymbolCUDADataAttribute(mlir::MLIRContext *mlirContext,
-                                 const Fortran::semantics::Symbol &sym);
+cuf::DataAttributeAttr
+translateSymbolCUFDataAttribute(mlir::MLIRContext *mlirContext,
+                                const Fortran::semantics::Symbol &sym);
 
 /// Map a symbol to a given fir::ExtendedValue. This will generate an
 /// hlfir.declare when lowering to HLFIR and map the hlfir.declare result to the
diff --git a/flang/include/flang/Optimizer/Builder/FIRBuilder.h b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
index 0d650f830b64..287730ef2ac8 100644
--- a/flang/include/flang/Optimizer/Builder/FIRBuilder.h
+++ b/flang/include/flang/Optimizer/Builder/FIRBuilder.h
@@ -254,13 +254,13 @@ public:
                              mlir::StringAttr linkage = {},
                              mlir::Attribute value = {}, bool isConst = false,
                              bool isTarget = false,
-                             fir::CUDADataAttributeAttr cudaAttr = {});
+                             cuf::DataAttributeAttr dataAttr = {});
 
   fir::GlobalOp createGlobal(mlir::Location loc, mlir::Type type,
                              llvm::StringRef name, bool isConst, bool isTarget,
                              std::function<void(FirOpBuilder &)> bodyBuilder,
                              mlir::StringAttr linkage = {},
-                             fir::CUDADataAttributeAttr cudaAttr = {});
+                             cuf::DataAttributeAttr dataAttr = {});
 
   /// Create a global constant (read-only) value.
   fir::GlobalOp createGlobalConstant(mlir::Location loc, mlir::Type type,
diff --git a/flang/include/flang/Optimizer/Builder/HLFIRTools.h b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
index 6cc8e71b3b18..43aa1661550e 100644
--- a/flang/include/flang/Optimizer/Builder/HLFIRTools.h
+++ b/flang/include/flang/Optimizer/Builder/HLFIRTools.h
@@ -239,7 +239,7 @@ genDeclare(mlir::Location loc, fir::FirOpBuilder &builder,
            const fir::ExtendedValue &exv, llvm::StringRef name,
            fir::FortranVariableFlagsAttr flags,
            mlir::Value dummyScope = nullptr,
-           fir::CUDADataAttributeAttr cudaAttr = {});
+           cuf::DataAttributeAttr dataAttr = {});
 
 /// Generate an hlfir.associate to build a variable from an expression value.
 /// The type of the variable must be provided so that scalar logicals are
diff --git a/flang/include/flang/Optimizer/Dialect/CMakeLists.txt b/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
index f00993d4d377..301a93c1fe5b 100644
--- a/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
+++ b/flang/include/flang/Optimizer/Dialect/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(CUF)
+
 # This replicates part of the add_mlir_dialect cmake function from MLIR that
 # cannot be used her because it expects to be run inside MLIR directory which
 # is not the case for FIR.
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt b/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt
new file mode 100644
index 000000000000..bae7fe3484f4
--- /dev/null
+++ b/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(LLVM_TARGET_DEFINITIONS CUFAttr.td)
+mlir_tablegen(CUFEnumAttr.h.inc -gen-enum-decls)
+mlir_tablegen(CUFEnumAttr.cpp.inc -gen-enum-defs)
+mlir_tablegen(CUFAttr.h.inc --gen-attrdef-decls)
+mlir_tablegen(CUFAttr.cpp.inc -gen-attrdef-defs)
+
+add_public_tablegen_target(CUFAttrsIncGen)
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h b/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h
new file mode 100644
index 000000000000..f32e39b543e3
--- /dev/null
+++ b/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h
@@ -0,0 +1,106 @@
+//===-- Optimizer/Dialect/CUF/Attributes/CUFAttr.h -- CUF attributes ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_DIALECT_CUF_CUFATTR_H
+#define FORTRAN_OPTIMIZER_DIALECT_CUF_CUFATTR_H
+
+#include "flang/Common/Fortran.h"
+#include "mlir/IR/BuiltinAttributes.h"
+
+namespace llvm {
+class StringRef;
+}
+
+#include "flang/Optimizer/Dialect/CUF/Attributes/CUFEnumAttr.h.inc"
+
+#define GET_ATTRDEF_CLASSES
+#include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h.inc"
+
+namespace cuf {
+
+/// Attribute to mark Fortran entities with the CUDA attribute.
+static constexpr llvm::StringRef getDataAttrName() { return "cuf.data_attr"; }
+static constexpr llvm::StringRef getProcAttrName() { return "cuf.proc_attr"; }
+
+/// Attribute to carry CUDA launch_bounds values.
+static constexpr llvm::StringRef getLaunchBoundsAttrName() {
+  return "cuf.launch_bounds";
+}
+
+/// Attribute to carry CUDA cluster_dims values.
+static constexpr llvm::StringRef getClusterDimsAttrName() {
+  return "cuf.cluster_dims";
+}
+
+inline cuf::DataAttributeAttr
+getDataAttribute(mlir::MLIRContext *mlirContext,
+                 std::optional<Fortran::common::CUDADataAttr> cudaAttr) {
+  if (cudaAttr) {
+    cuf::DataAttribute attr;
+    switch (*cudaAttr) {
+    case Fortran::common::CUDADataAttr::Constant:
+      attr = cuf::DataAttribute::Constant;
+      break;
+    case Fortran::common::CUDADataAttr::Device:
+      attr = cuf::DataAttribute::Device;
+      break;
+    case Fortran::common::CUDADataAttr::Managed:
+      attr = cuf::DataAttribute::Managed;
+      break;
+    case Fortran::common::CUDADataAttr::Pinned:
+      attr = cuf::DataAttribute::Pinned;
+      break;
+    case Fortran::common::CUDADataAttr::Shared:
+      attr = cuf::DataAttribute::Shared;
+      break;
+    case Fortran::common::CUDADataAttr::Texture:
+      // Obsolete attribute
+      return {};
+    case Fortran::common::CUDADataAttr::Unified:
+      attr = cuf::DataAttribute::Unified;
+      break;
+    }
+    return cuf::DataAttributeAttr::get(mlirContext, attr);
+  }
+  return {};
+}
+
+inline cuf::ProcAttributeAttr
+getProcAttribute(mlir::MLIRContext *mlirContext,
+                 std::optional<Fortran::common::CUDASubprogramAttrs> cudaAttr) {
+  if (cudaAttr) {
+    cuf::ProcAttribute attr;
+    switch (*cudaAttr) {
+    case Fortran::common::CUDASubprogramAttrs::Host:
+      attr = cuf::ProcAttribute::Host;
+      break;
+    case Fortran::common::CUDASubprogramAttrs::Device:
+      attr = cuf::ProcAttribute::Device;
+      break;
+    case Fortran::common::CUDASubprogramAttrs::HostDevice:
+      attr = cuf::ProcAttribute::HostDevice;
+      break;
+    case Fortran::common::CUDASubprogramAttrs::Global:
+      attr = cuf::ProcAttribute::Global;
+      break;
+    case Fortran::common::CUDASubprogramAttrs::Grid_Global:
+      attr = cuf::ProcAttribute::GridGlobal;
+      break;
+    }
+    return cuf::ProcAttributeAttr::get(mlirContext, attr);
+  }
+  return {};
+}
+
+} // namespace cuf
+
+#endif // FORTRAN_OPTIMIZER_DIALECT_CUF_CUFATTR_H
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td b/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td
new file mode 100644
index 000000000000..8e2b54672527
--- /dev/null
+++ b/flang/include/flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td
@@ -0,0 +1,100 @@
+//===- CUFAttr.td - CUF Attributes -------------------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the CUF dialect attributes.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_DIALECT_CUF_CUFATTRS
+#define FORTRAN_DIALECT_CUF_CUFATTRS
+
+include "flang/Optimizer/Dialect/CUF/CUFDialect.td"
+include "mlir/IR/EnumAttr.td"
+
+class cuf_Attr<string name> : AttrDef<CUFDialect, name>;
+
+def cuf_DataAttribute : I32EnumAttr<
+    "DataAttribute",
+    "CUDA Fortran variable attributes",
+    [
+      I32EnumAttrCase<"Constant", 0, "constant">,
+      I32EnumAttrCase<"Device", 1, "device">,
+      I32EnumAttrCase<"Managed", 2, "managed">,
+      I32EnumAttrCase<"Pinned", 3, "pinned">,
+      I32EnumAttrCase<"Shared", 4, "shared">,
+      I32EnumAttrCase<"Unified", 5, "unified">,
+      // Texture is omitted since it is obsolete and rejected by semantic.
+    ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::cuf";
+}
+
+def cuf_DataAttributeAttr :
+    EnumAttr<CUFDialect, cuf_DataAttribute, "cuda"> {
+  let assemblyFormat = [{ ```<` $value `>` }];
+}
+
+def cuf_ProcAttribute : I32EnumAttr<
+    "ProcAttribute", "CUDA Fortran procedure attributes",
+    [
+      I32EnumAttrCase<"Host", 0, "host">,
+      I32EnumAttrCase<"Device", 1, "device">,
+      I32EnumAttrCase<"HostDevice", 2, "host_device">,
+      I32EnumAttrCase<"Global", 3, "global">,
+      I32EnumAttrCase<"GridGlobal", 4, "grid_global">,
+    ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::cuf";
+}
+
+def cuf_ProcAttributeAttr :
+    EnumAttr<CUFDialect, cuf_ProcAttribute, "cuda_proc"> {
+  let assemblyFormat = [{ ```<` $value `>` }];
+}
+
+def cuf_LaunchBoundsAttr : cuf_Attr<"LaunchBounds"> {
+  let mnemonic = "launch_bounds";
+
+  let parameters = (ins
+    "mlir::IntegerAttr":$maxTPB,
+    "mlir::IntegerAttr":$minBPM,
+    OptionalParameter<"mlir::IntegerAttr">:$upperBoundClusterSize
+  );
+
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+def cuf_ClusterDimsAttr : cuf_Attr<"ClusterDims"> {
+  let mnemonic = "cluster_dims";
+
+  let parameters = (ins
+    "mlir::IntegerAttr":$x,
+    "mlir::IntegerAttr":$y,
+    "mlir::IntegerAttr":$z
+  );
+
+  let assemblyFormat = "`<` struct(params) `>`";
+}
+
+def cuf_DataTransferKind : I32EnumAttr<
+    "DataTransferKind", "CUDA Fortran data transfer kind",
+    [
+      I32EnumAttrCase<"DeviceHost", 0, "device_host">,
+      I32EnumAttrCase<"HostDevice", 1, "host_device">,
+      I32EnumAttrCase<"DeviceDevice", 2, "device_device">,
+    ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::cuf";
+}
+
+def cuf_DataTransferKindAttr :
+    EnumAttr<CUFDialect, cuf_DataTransferKind, "cuda_transfer"> {
+  let assemblyFormat = [{ ```<` $value `>` }];
+}
+
+#endif // FORTRAN_DIALECT_CUF_CUFATTRS
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CMakeLists.txt b/flang/include/flang/Optimizer/Dialect/CUF/CMakeLists.txt
new file mode 100644
index 000000000000..07490c7b9ca2
--- /dev/null
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_subdirectory(Attributes)
+
+set(LLVM_TARGET_DEFINITIONS CUFDialect.td)
+mlir_tablegen(CUFDialect.h.inc -gen-dialect-decls -dialect=cuf)
+mlir_tablegen(CUFDialect.cpp.inc -gen-dialect-defs -dialect=cuf)
+
+set(LLVM_TARGET_DEFINITIONS CUFOps.td)
+mlir_tablegen(CUFOps.h.inc -gen-op-decls)
+mlir_tablegen(CUFOps.cpp.inc -gen-op-defs)
+
+add_public_tablegen_target(CUFOpsIncGen)
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFDialect.h b/flang/include/flang/Optimizer/Dialect/CUF/CUFDialect.h
new file mode 100644
index 000000000000..cf562b226835
--- /dev/null
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFDialect.h
@@ -0,0 +1,26 @@
+//===-- Optimizer/Dialect/CUFDialect.h -- CUF dialect -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_DIALECT_CUF_CUFDIALECT_H
+#define FORTRAN_OPTIMIZER_DIALECT_CUF_CUFDIALECT_H
+
+#include "mlir/Bytecode/BytecodeOpInterface.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/SymbolTable.h"
+#include "mlir/Interfaces/CallInterfaces.h"
+#include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/LoopLikeInterface.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+
+#include "flang/Optimizer/Dialect/CUF/CUFDialect.h.inc"
+
+#endif // FORTRAN_OPTIMIZER_DIALECT_CUF_CUFDIALECT_H
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFDialect.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFDialect.td
new file mode 100644
index 000000000000..df866e566406
--- /dev/null
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFDialect.td
@@ -0,0 +1,43 @@
+//===-- CUFDialect.td - CUF dialect base definitions -------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Definition of the CUDA Fortran dialect
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_DIALECT_CUF_CUFDIALECT
+#define FORTRAN_DIALECT_CUF_CUFDIALECT
+
+include "mlir/IR/AttrTypeBase.td"
+include "mlir/IR/EnumAttr.td"
+include "mlir/IR/OpBase.td"
+
+def CUFDialect : Dialect {
+  let name = "cuf";
+
+  let summary = "CUDA Fortran dialect";
+
+  let description = [{
+    This dialect models CUDA Fortran operations. The CUF dialect operations use
+    the FIR types and are tightly coupled with FIR and HLFIR.
+  }];
+
+  let useDefaultAttributePrinterParser = 1;
+  let usePropertiesForAttributes = 1;
+  let cppNamespace = "::cuf";
+  let dependentDialects = ["fir::FIROpsDialect"];
+
+  let extraClassDeclaration = [{
+  private:
+    // Register the CUF Attributes.
+    void registerAttributes();
+  }];
+}
+
+#endif // FORTRAN_DIALECT_CUF_CUFDIALECT
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h
new file mode 100644
index 000000000000..4132db672e39
--- /dev/null
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.h
@@ -0,0 +1,20 @@
+//===-- Optimizer/Dialect/CUF/CUFOps.h - CUF operations ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_OPTIMIZER_DIALECT_CUF_CUFOPS_H
+#define FORTRAN_OPTIMIZER_DIALECT_CUF_CUFOPS_H
+
+#include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
+#include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "mlir/IR/OpDefinition.h"
+
+#define GET_OP_CLASSES
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h.inc"
+
+#endif // FORTRAN_OPTIMIZER_DIALECT_CUF_CUFOPS_H
diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
new file mode 100644
index 000000000000..72157bce4f76
--- /dev/null
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -0,0 +1,263 @@
+//===-- CUFOps.td - CUF operation definitions --------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Definition of the CUF dialect operations
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_DIALECT_CUF_CUF_OPS
+#define FORTRAN_DIALECT_CUF_CUF_OPS
+
+include "flang/Optimizer/Dialect/CUF/CUFDialect.td"
+include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td"
+include "flang/Optimizer/Dialect/FIRTypes.td"
+include "mlir/Interfaces/LoopLikeInterface.td"
+include "mlir/IR/BuiltinAttributes.td"
+
+class cuf_Op<string mnemonic, list<Trait> traits>
+    : Op<CUFDialect, mnemonic, traits>;
+
+def cuf_AllocOp : cuf_Op<"alloc", [AttrSizedOperandSegments,
+    MemoryEffects<[MemAlloc]>]> {
+  let summary = "Allocate an object on device";
+
+  let description = [{
+    This is a drop in replacement for fir.alloca and fir.allocmem for device
+    object. Any device, managed or unified object declared in an host
+    subprogram needs to be allocated in the device memory through runtime calls.
+    The cuf.alloc is an abstraction to the runtime calls and works together
+    with cuf.free.
+  }];
+
+  let arguments = (ins
+    TypeAttr:$in_type,
+    OptionalAttr<StrAttr>:$uniq_name,
+    OptionalAttr<StrAttr>:$bindc_name,
+    Variadic<AnyIntegerType>:$typeparams,
+    Variadic<AnyIntegerType>:$shape,
+    cuf_DataAttributeAttr:$data_attr
+  );
+
+  let results = (outs fir_ReferenceType:$ptr);
+
+  let assemblyFormat = [{
+    $in_type (`(` $typeparams^ `:` type($typeparams) `)`)?
+        (`,` $shape^ `:` type($shape) )?  attr-dict `->` qualified(type($ptr))
+  }];
+
+  let builders = [
+    OpBuilder<(ins "mlir::Type":$inType, "llvm::StringRef":$uniqName,
+      "llvm::StringRef":$bindcName,
+      "cuf::DataAttributeAttr":$cudaAttr,
+      CArg<"mlir::ValueRange", "{}">:$typeparams,
+      CArg<"mlir::ValueRange", "{}">:$shape,
+      CArg<"llvm::ArrayRef<mlir::NamedAttribute>", "{}">:$attributes)>];
+
+  let hasVerifier = 1;
+}
+
+def cuf_FreeOp : cuf_Op<"free", [MemoryEffects<[MemFree]>]> {
+  let summary = "Free a device allocated object";
+
+  let description = [{
+    The cuf.free operation frees the memory allocated by cuf.alloc.
+    This is used for non-allocatable device, managed and unified device
+    variables declare in host subprogram.
+  }];
+
+  let arguments = (ins
+    Arg<AnyReferenceLike, "", [MemFree]>:$devptr,
+    cuf_DataAttributeAttr:$data_attr
+  );
+
+  let assemblyFormat = "$devptr `:` qualified(type($devptr)) attr-dict";
+
+  let hasVerifier = 1;
+}
+
+def cuf_AllocateOp : cuf_Op<"allocate", [AttrSizedOperandSegments,
+    MemoryEffects<[MemAlloc<DefaultResource>]>]> {
+  let summary = "Perform the device allocation of data of an allocatable";
+
+  let description = [{
+    The cuf.allocate operation performs the allocation on the device
+    of the data of an allocatable. The descriptor passed to the operation
+    is initialized before with the standard flang runtime calls.
+  }];
+
+  let arguments = (ins Arg<fir_ReferenceType, "", [MemRead, MemWrite]>:$box,
+                       Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg,
+                       Optional<AnyIntegerType>:$stream,
+                       Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$pinned,
+                       Arg<Optional<AnyRefOrBoxType>, "", [MemRead]>:$source,
+                       cuf_DataAttributeAttr:$data_attr,
+                       UnitAttr:$hasStat);
+
+  let results = (outs AnyIntegerType:$stat);
+
+  let assemblyFormat = [{
+    $box `:` qualified(type($box))
+    ( `source` `(` $source^ `:` qualified(type($source) )`)` )?
+    ( `errmsg` `(` $errmsg^ `:` type($errmsg) `)` )?
+    ( `stream` `(` $stream^ `:` type($stream) `)` )?
+    ( `pinned` `(` $pinned^ `:` type($pinned) `)` )?
+    attr-dict `->` type($stat)
+  }];
+
+  let hasVerifier = 1;
+}
+
+def cuf_DeallocateOp : cuf_Op<"deallocate",
+    [MemoryEffects<[MemFree<DefaultResource>]>]> {
+  let summary = "Perform the device deallocation of data of an allocatable";
+
+  let description = [{
+    The cuf.deallocate operation performs the deallocation on the device
+    of the data of an allocatable.
+  }];
+
+  let arguments = (ins Arg<fir_ReferenceType, "", [MemRead, MemWrite]>:$box,
+                       Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg,
+                       cuf_DataAttributeAttr:$data_attr,
+                       UnitAttr:$hasStat);
+
+  let results = (outs AnyIntegerType:$stat);
+
+  let assemblyFormat = [{
+    $box `:` qualified(type($box))
+    ( `errmsg` `(` $errmsg^ `:` type($errmsg) `)` )?
+    attr-dict `->` type($stat)
+  }];
+
+  let hasVerifier = 1;
+}
+
+def cuf_DataTransferOp : cuf_Op<"data_transfer", []> {
+  let summary = "Represent a data transfer between host and device memory";
+
+  let description = [{
+    CUDA Fortran allows data transfer to be done via intrinsic assignment
+    between a host and a device variable. This operation is used to materialized
+    the data transfer between the lhs and rhs memory references.
+    The kind of transfer is specified in the attribute. 
+
+    ```
+      adev = a ! transfer host to device
+      a = adev ! transfer device to host
+      bdev = adev ! transfer device to device
+    ```
+  }];
+
+  let arguments = (ins Arg<AnyReferenceLike, "", [MemWrite]>:$src,
+                       Arg<AnyReferenceLike, "", [MemRead]>:$dst,
+                       cuf_DataTransferKindAttr:$transfer_kind);
+
+  let assemblyFormat = [{
+    $src `to` $dst attr-dict `:` type(operands)
+  }];
+}
+
+def cuf_KernelLaunchOp : cuf_Op<"kernel_launch", [CallOpInterface,
+    AttrSizedOperandSegments]> {
+  let summary = "call CUDA kernel";
+
+  let description = [{
+    Launch a CUDA kernel from the host.
+
+    ```
+      // launch simple kernel with no arguments. bytes and stream value are
+      // optional in the chevron notation.
+      cuf.kernel_launch @kernel<<<%gx, %gy, %bx, %by, %bz>>>()
+    ```
+  }];
+
+  let arguments = (ins
+    SymbolRefAttr:$callee,
+    I32:$grid_x,
+    I32:$grid_y,
+    I32:$grid_z,
+    I32:$block_x,
+    I32:$block_y,
+    I32:$block_z,
+    Optional<I32>:$bytes,
+    Optional<I32>:$stream,
+    Variadic<AnyType>:$args
+  );
+
+  let assemblyFormat = [{
+    $callee `<` `<` `<` $grid_x `,` $grid_y `,` $grid_z `,`$block_x `,`
+        $block_y `,` $block_z ( `,` $bytes^ ( `,` $stream^ )? )? `>` `>` `>`
+        `` `(` $args `)` ( `:` `(` type($args)^ `)` )? attr-dict
+  }];
+
+  let extraClassDeclaration = [{
+    mlir::CallInterfaceCallable getCallableForCallee() {
+      return getCalleeAttr();
+    }
+
+    void setCalleeFromCallable(mlir::CallInterfaceCallable callee) {
+      (*this)->setAttr(getCalleeAttrName(), callee.get<mlir::SymbolRefAttr>());
+    }
+    mlir::FunctionType getFunctionType();
+
+    unsigned getNbNoArgOperand() {
+      unsigned nbNoArgOperand = 5; // grids and blocks values are always present.
+      if (getBytes()) ++nbNoArgOperand;
+      if (getStream()) ++nbNoArgOperand;
+      return nbNoArgOperand;
+    }
+
+    operand_range getArgOperands() {
+      return {operand_begin() + getNbNoArgOperand(), operand_end()};
+    }
+    mlir::MutableOperandRange getArgOperandsMutable() {
+      return mlir::MutableOperandRange(
+          *this, getNbNoArgOperand(), getArgs().size() - 1);
+    }
+  }];
+}
+
+def cuf_KernelOp : cuf_Op<"kernel", [AttrSizedOperandSegments,
+    DeclareOpInterfaceMethods<LoopLikeOpInterface>]> {
+
+  let description = [{
+    Represent the CUDA Fortran kernel directive. The operation is a loop like
+    operation that represents the iteration range of the embedded loop nest.
+
+    When grid or block variadic operands are empty, a `*` only syntax was used
+    in the Fortran code.
+    If the `*` is mixed with values for either grid or block, these are
+    represented by a 0 constant value.
+  }];
+
+  let arguments = (ins
+    Variadic<I32>:$grid, // empty means `*`
+    Variadic<I32>:$block, // empty means `*`
+    Optional<I32>:$stream,
+    Variadic<Index>:$lowerbound,
+    Variadic<Index>:$upperbound,
+    Variadic<Index>:$step,
+    OptionalAttr<I64Attr>:$n
+  );
+
+  let regions = (region AnyRegion:$region);
+
+  let assemblyFormat = [{
+    `<` `<` `<` custom<CUFKernelValues>($grid, type($grid)) `,` 
+                custom<CUFKernelValues>($block, type($block))
+        ( `,` `stream` `=` $stream^ )? `>` `>` `>`
+        custom<CUFKernelLoopControl>($region, $lowerbound, type($lowerbound),
+            $upperbound, type($upperbound), $step, type($step))
+        attr-dict
+  }];
+
+  let hasVerifier = 1;
+}
+
+#endif // FORTRAN_DIALECT_CUF_CUF_OPS
diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
index f8b3fb861cc6..989319ff3dda 100644
--- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td
+++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td
@@ -70,87 +70,4 @@ def fir_BoxFieldAttr : I32EnumAttr<
 // mlir::SideEffects::Resource for modelling operations which add debugging information
 def DebuggingResource : Resource<"::fir::DebuggingResource">;
 
-//===----------------------------------------------------------------------===//
-// CUDA Fortran specific attributes
-//===----------------------------------------------------------------------===//
-
-def fir_CUDADataAttribute : I32EnumAttr<
-    "CUDADataAttribute",
-    "CUDA Fortran variable attributes",
-    [
-      I32EnumAttrCase<"Constant", 0, "constant">,
-      I32EnumAttrCase<"Device", 1, "device">,
-      I32EnumAttrCase<"Managed", 2, "managed">,
-      I32EnumAttrCase<"Pinned", 3, "pinned">,
-      I32EnumAttrCase<"Shared", 4, "shared">,
-      I32EnumAttrCase<"Unified", 5, "unified">,
-      // Texture is omitted since it is obsolete and rejected by semantic.
-    ]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::fir";
-}
-
-def fir_CUDADataAttributeAttr :
-    EnumAttr<FIROpsDialect, fir_CUDADataAttribute, "cuda"> {
-  let assemblyFormat = [{ ```<` $value `>` }];
-}
-
-def fir_CUDAProcAttribute : I32EnumAttr<
-    "CUDAProcAttribute", "CUDA Fortran procedure attributes",
-    [
-      I32EnumAttrCase<"Host", 0, "host">,
-      I32EnumAttrCase<"Device", 1, "device">,
-      I32EnumAttrCase<"HostDevice", 2, "host_device">,
-      I32EnumAttrCase<"Global", 3, "global">,
-      I32EnumAttrCase<"GridGlobal", 4, "grid_global">,
-    ]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::fir";
-}
-
-def fir_CUDAProcAttributeAttr :
-    EnumAttr<FIROpsDialect, fir_CUDAProcAttribute, "cuda_proc"> {
-  let assemblyFormat = [{ ```<` $value `>` }];
-}
-
-def fir_CUDALaunchBoundsAttr : fir_Attr<"CUDALaunchBounds"> {
-  let mnemonic = "launch_bounds";
-
-  let parameters = (ins
-    "mlir::IntegerAttr":$maxTPB,
-    "mlir::IntegerAttr":$minBPM,
-    OptionalParameter<"mlir::IntegerAttr">:$upperBoundClusterSize
-  );
-
-  let assemblyFormat = "`<` struct(params) `>`";
-}
-
-def fir_CUDAClusterDimsAttr : fir_Attr<"CUDAClusterDims"> {
-  let mnemonic = "cluster_dims";
-
-  let parameters = (ins
-    "mlir::IntegerAttr":$x,
-    "mlir::IntegerAttr":$y,
-    "mlir::IntegerAttr":$z
-  );
-
-  let assemblyFormat = "`<` struct(params) `>`";
-}
-
-def fir_CUDADataTransferKind : I32EnumAttr<
-    "CUDADataTransferKind", "CUDA Fortran data transfer kind",
-    [
-      I32EnumAttrCase<"DeviceHost", 0, "device_host">,
-      I32EnumAttrCase<"HostDevice", 1, "host_device">,
-      I32EnumAttrCase<"DeviceDevice", 2, "device_device">,
-    ]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = "::fir";
-}
-
-def fir_CUDADataTransferKindAttr :
-    EnumAttr<FIROpsDialect, fir_CUDADataTransferKind, "cuda_transfer"> {
-  let assemblyFormat = [{ ```<` $value `>` }];
-}
-
 #endif // FIR_DIALECT_FIR_ATTRS
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.h b/flang/include/flang/Optimizer/Dialect/FIROps.h
index 016ad0433ed8..9f07364ddb62 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.h
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.h
@@ -9,6 +9,7 @@
 #ifndef FORTRAN_OPTIMIZER_DIALECT_FIROPS_H
 #define FORTRAN_OPTIMIZER_DIALECT_FIROPS_H
 
+#include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
 #include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/Dialect/FirAliasTagOpInterface.h"
diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td
index 64c5e360b28f..d9c114904006 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROps.td
+++ b/flang/include/flang/Optimizer/Dialect/FIROps.td
@@ -17,6 +17,7 @@
 include "mlir/Dialect/Arith/IR/ArithBase.td"
 include "mlir/Dialect/Arith/IR/ArithOpsInterfaces.td"
 include "mlir/Dialect/LLVMIR/LLVMAttrDefs.td"
+include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td"
 include "flang/Optimizer/Dialect/FIRDialect.td"
 include "flang/Optimizer/Dialect/FIRTypes.td"
 include "flang/Optimizer/Dialect/FIRAttr.td"
@@ -2436,66 +2437,6 @@ def fir_DispatchOp : fir_Op<"dispatch", []> {
   }];
 }
 
-def fir_CUDAKernelLaunch : fir_Op<"cuda_kernel_launch", [CallOpInterface,
-    AttrSizedOperandSegments]> {
-  let summary = "call CUDA kernel";
-
-  let description = [{
-    Launch a CUDA kernel from the host.
-
-    ```
-      // launch simple kernel with no arguments. bytes and stream value are
-      // optional in the chevron notation.
-      fir.cuda_kernel_launch @kernel<<<%gx, %gy, %bx, %by, %bz>>>()
-    ```
-  }];
-
-  let arguments = (ins
-    SymbolRefAttr:$callee,
-    I32:$grid_x,
-    I32:$grid_y,
-    I32:$grid_z,
-    I32:$block_x,
-    I32:$block_y,
-    I32:$block_z,
-    Optional<I32>:$bytes,
-    Optional<I32>:$stream,
-    Variadic<AnyType>:$args
-  );
-
-  let assemblyFormat = [{
-    $callee `<` `<` `<` $grid_x `,` $grid_y `,` $grid_z `,`$block_x `,`
-        $block_y `,` $block_z ( `,` $bytes^ ( `,` $stream^ )? )? `>` `>` `>`
-        `` `(` $args `)` ( `:` `(` type($args)^ `)` )? attr-dict
-  }];
-
-  let extraClassDeclaration = [{
-    mlir::CallInterfaceCallable getCallableForCallee() {
-      return getCalleeAttr();
-    }
-
-    void setCalleeFromCallable(mlir::CallInterfaceCallable callee) {
-      (*this)->setAttr(getCalleeAttrName(), callee.get<mlir::SymbolRefAttr>());
-    }
-    mlir::FunctionType getFunctionType();
-
-    unsigned getNbNoArgOperand() {
-      unsigned nbNoArgOperand = 5; // grids and blocks values are always present.
-      if (getBytes()) ++nbNoArgOperand;
-      if (getStream()) ++nbNoArgOperand;
-      return nbNoArgOperand;
-    }
-
-    operand_range getArgOperands() {
-      return {operand_begin() + getNbNoArgOperand(), operand_end()};
-    }
-    mlir::MutableOperandRange getArgOperandsMutable() {
-      return mlir::MutableOperandRange(
-          *this, getNbNoArgOperand(), getArgs().size() - 1);
-    }
-  }];
-}
-
 // Constant operations that support Fortran
 
 def fir_StringLitOp : fir_Op<"string_lit", [NoMemoryEffect]> {
@@ -2797,7 +2738,7 @@ def fir_GlobalOp : fir_Op<"global", [IsolatedFromAbove, Symbol]> {
     OptionalAttr<UnitAttr>:$constant,
     OptionalAttr<UnitAttr>:$target,
     OptionalAttr<StrAttr>:$linkName,
-    OptionalAttr<fir_CUDADataAttributeAttr>:$cuda_attr
+    OptionalAttr<cuf_DataAttributeAttr>:$data_attr
   );
 
   let regions = (region AtMostRegion<1>:$region);
@@ -3077,7 +3018,7 @@ def fir_DeclareOp : fir_Op<"declare", [AttrSizedOperandSegments,
     Optional<fir_DummyScopeType>:$dummy_scope,
     Builtin_StringAttr:$uniq_name,
     OptionalAttr<fir_FortranVariableFlagsAttr>:$fortran_attrs,
-    OptionalAttr<fir_CUDADataAttributeAttr>:$cuda_attr
+    OptionalAttr<cuf_DataAttributeAttr>:$data_attr
   );
 
   let results = (outs AnyRefOrBox);
@@ -3130,125 +3071,6 @@ def fir_BoxOffsetOp : fir_Op<"box_offset", [NoMemoryEffect]> {
   ];
 }
 
-def fir_CUDAKernelOp : fir_Op<"cuda_kernel", [AttrSizedOperandSegments,
-    DeclareOpInterfaceMethods<LoopLikeOpInterface>]> {
-
-  let description = [{
-    Represent the CUDA Fortran kernel directive. The operation is a loop like
-    operation that represents the iteration range of the embedded loop nest.
-
-    When grid or block variadic operands are empty, a `*` only syntax was used
-    in the Fortran code.
-    If the `*` is mixed with values for either grid or block, these are
-    represented by a 0 constant value.
-  }];
-
-  let arguments = (ins
-    Variadic<I32>:$grid, // empty means `*`
-    Variadic<I32>:$block, // empty means `*`
-    Optional<I32>:$stream,
-    Variadic<Index>:$lowerbound,
-    Variadic<Index>:$upperbound,
-    Variadic<Index>:$step,
-    OptionalAttr<I64Attr>:$n
-  );
-
-  let regions = (region AnyRegion:$region);
-
-  let assemblyFormat = [{
-    `<` `<` `<` custom<CUFKernelValues>($grid, type($grid)) `,` 
-                custom<CUFKernelValues>($block, type($block))
-        ( `,` `stream` `=` $stream^ )? `>` `>` `>`
-        custom<CUFKernelLoopControl>($region, $lowerbound, type($lowerbound),
-            $upperbound, type($upperbound), $step, type($step))
-        attr-dict
-  }];
-
-  let hasVerifier = 1;
-}
-
-def fir_CUDADataTransferOp : fir_Op<"cuda_data_transfer", []> {
-  let summary = "Represent a data transfer between host and device memory";
-
-  let description = [{
-    CUDA Fortran allows data transfer to be done via intrinsic assignment
-    between a host and a device variable. This operation is used to materialized
-    the data transfer between the lhs and rhs memory references.
-    The kind of transfer is specified in the attribute. 
-
-    ```
-      adev = a ! transfer host to device
-      a = adev ! transfer device to host
-      bdev = adev ! transfer device to device
-    ```
-  }];
-
-  let arguments = (ins Arg<AnyReferenceLike, "", [MemWrite]>:$src,
-                       Arg<AnyReferenceLike, "", [MemRead]>:$dst,
-                       fir_CUDADataTransferKindAttr:$transfer_kind);
-
-  let assemblyFormat = [{
-    $src `to` $dst attr-dict `:` type(operands)
-  }];
-}
-
-def fir_CUDAAllocateOp : fir_Op<"cuda_allocate", [AttrSizedOperandSegments,
-    MemoryEffects<[MemAlloc<DefaultResource>]>]> {
-  let summary = "Perform the device allocation of data of an allocatable";
-
-  let description = [{
-    The fir.cuda_allocate operation performs the allocation on the device
-    of the data of an allocatable. The descriptor passed to the operation
-    is initialized before with the standard flang runtime calls.
-  }];
-
-  let arguments = (ins Arg<fir_ReferenceType, "", [MemRead, MemWrite]>:$box,
-                       Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg,
-                       Optional<AnyIntegerType>:$stream,
-                       Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$pinned,
-                       Arg<Optional<AnyRefOrBoxType>, "", [MemRead]>:$source,
-                       fir_CUDADataAttributeAttr:$cuda_attr,
-                       UnitAttr:$hasStat);
-
-  let results = (outs AnyIntegerType:$stat);
-
-  let assemblyFormat = [{
-    $box `:` qualified(type($box))
-    ( `source` `(` $source^ `:` qualified(type($source) )`)` )?
-    ( `errmsg` `(` $errmsg^ `:` type($errmsg) `)` )?
-    ( `stream` `(` $stream^ `:` type($stream) `)` )?
-    ( `pinned` `(` $pinned^ `:` type($pinned) `)` )?
-    attr-dict `->` type($stat)
-  }];
-
-  let hasVerifier = 1;
-}
-
-def fir_CUDADeallocateOp : fir_Op<"cuda_deallocate",
-    [MemoryEffects<[MemFree<DefaultResource>]>]> {
-  let summary = "Perform the device deallocation of data of an allocatable";
-
-  let description = [{
-    The fir.cuda_deallocate operation performs the deallocation on the device
-    of the data of an allocatable.
-  }];
-
-  let arguments = (ins Arg<fir_ReferenceType, "", [MemRead, MemWrite]>:$box,
-                       Arg<Optional<AnyRefOrBoxType>, "", [MemWrite]>:$errmsg,
-                       fir_CUDADataAttributeAttr:$cuda_attr,
-                       UnitAttr:$hasStat);
-
-  let results = (outs AnyIntegerType:$stat);
-
-  let assemblyFormat = [{
-    $box `:` qualified(type($box))
-    ( `errmsg` `(` $errmsg^ `:` type($errmsg) `)` )?
-    attr-dict `->` type($stat)
-  }];
-
-  let hasVerifier = 1;
-}
-
 def fir_DummyScopeOp : fir_Op<"dummy_scope",
     [MemoryEffects<[MemWrite<DebuggingResource>]>]> {
   let summary = "Define a scope for dummy arguments";
@@ -3329,62 +3151,4 @@ def fir_DummyScopeOp : fir_Op<"dummy_scope",
   let assemblyFormat = "attr-dict `:` type(results)";
 }
 
-def fir_CUDAAllocOp : fir_Op<"cuda_alloc", [AttrSizedOperandSegments,
-    MemoryEffects<[MemAlloc]>]> {
-  let summary = "Allocate an object on device";
-
-  let description = [{
-    This is a drop in replacement for fir.alloca and fir.allocmem for device
-    object. Any device, managed or unified object declared in an host
-    subprogram needs to be allocated in the device memory through runtime calls.
-    The fir.cuda_alloc is an abstraction to the runtime calls and works together
-    with fir.cuda_free.
-  }];
-
-  let arguments = (ins
-    TypeAttr:$in_type,
-    OptionalAttr<StrAttr>:$uniq_name,
-    OptionalAttr<StrAttr>:$bindc_name,
-    Variadic<AnyIntegerType>:$typeparams,
-    Variadic<AnyIntegerType>:$shape,
-    fir_CUDADataAttributeAttr:$cuda_attr
-  );
-
-  let results = (outs fir_ReferenceType:$ptr);
-
-  let assemblyFormat = [{
-    $in_type (`(` $typeparams^ `:` type($typeparams) `)`)?
-        (`,` $shape^ `:` type($shape) )?  attr-dict `->` qualified(type($ptr))
-  }];
-
-  let builders = [
-    OpBuilder<(ins "mlir::Type":$inType, "llvm::StringRef":$uniqName,
-      "llvm::StringRef":$bindcName,
-      "fir::CUDADataAttributeAttr":$cudaAttr,
-      CArg<"mlir::ValueRange", "{}">:$typeparams,
-      CArg<"mlir::ValueRange", "{}">:$shape,
-      CArg<"llvm::ArrayRef<mlir::NamedAttribute>", "{}">:$attributes)>];
-
-  let hasVerifier = 1;
-}
-
-def fir_CUDAFreeOp : fir_Op<"cuda_free", [MemoryEffects<[MemFree]>]> {
-  let summary = "Free a device allocated object";
-
-  let description = [{
-    The fir.cuda_free operation frees the memory allocated by fir.cuda_alloc.
-    This is used for non-allocatable device, managed and unified device
-    variables declare in host subprogram.
-  }];
-
-  let arguments = (ins
-    Arg<AnyReferenceLike, "", [MemFree]>:$devptr,
-    fir_CUDADataAttributeAttr:$cuda_attr
-  );
-
-  let assemblyFormat = "$devptr `:` qualified(type($devptr)) attr-dict";
-
-  let hasVerifier = 1;
-}
-
 #endif
diff --git a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h
index 46b62d8de8d3..b68a39bf374b 100644
--- a/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h
+++ b/flang/include/flang/Optimizer/Dialect/FIROpsSupport.h
@@ -75,19 +75,6 @@ constexpr llvm::StringRef getOptionalAttrName() { return "fir.optional"; }
 /// Attribute to mark Fortran entities with the TARGET attribute.
 static constexpr llvm::StringRef getTargetAttrName() { return "fir.target"; }
 
-/// Attribute to mark Fortran entities with the CUDA attribute.
-static constexpr llvm::StringRef getCUDAAttrName() { return "fir.cuda_attr"; }
-
-/// Attribute to carry CUDA launch_bounds values.
-static constexpr llvm::StringRef getCUDALaunchBoundsAttrName() {
-  return "fir.cuda_launch_bounds";
-}
-
-/// Attribute to carry CUDA cluster_dims values.
-static constexpr llvm::StringRef getCUDAClusterDimsAttrName() {
-  return "fir.cuda_cluster_dims";
-}
-
 /// Attribute to mark that a function argument is a character dummy procedure.
 /// Character dummy procedure have special ABI constraints.
 static constexpr llvm::StringRef getCharacterProcedureDummyAttrName() {
diff --git a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
index 376417e3c353..b537d9e11ef8 100644
--- a/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
+++ b/flang/include/flang/Optimizer/HLFIR/HLFIROps.td
@@ -15,6 +15,7 @@
 #define FORTRAN_DIALECT_HLFIR_OPS
 
 include "flang/Optimizer/HLFIR/HLFIROpBase.td"
+include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.td"
 include "flang/Optimizer/Dialect/FIRTypes.td"
 include "flang/Optimizer/Dialect/FIRAttr.td"
 include "flang/Optimizer/Dialect/FortranVariableInterface.td"
@@ -90,7 +91,7 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments,
     Optional<fir_DummyScopeType>:$dummy_scope,
     Builtin_StringAttr:$uniq_name,
     OptionalAttr<fir_FortranVariableFlagsAttr>:$fortran_attrs,
-    OptionalAttr<fir_CUDADataAttributeAttr>:$cuda_attr
+    OptionalAttr<cuf_DataAttributeAttr>:$data_attr
   );
 
   let results = (outs AnyFortranVariable, AnyRefOrBoxLike);
@@ -106,7 +107,7 @@ def hlfir_DeclareOp : hlfir_Op<"declare", [AttrSizedOperandSegments,
       CArg<"mlir::Value", "{}">:$shape, CArg<"mlir::ValueRange", "{}">:$typeparams,
       CArg<"mlir::Value", "{}">:$dummy_scope,
       CArg<"fir::FortranVariableFlagsAttr", "{}">:$fortran_attrs,
-      CArg<"fir::CUDADataAttributeAttr", "{}">:$cuda_attr)>];
+      CArg<"cuf::DataAttributeAttr", "{}">:$data_attr)>];
 
   let extraClassDeclaration = [{
     /// Get the variable original base (same as input). It lacks
diff --git a/flang/include/flang/Optimizer/Support/InitFIR.h b/flang/include/flang/Optimizer/Support/InitFIR.h
index 9f4c4ed28a4a..48cc1cbc6456 100644
--- a/flang/include/flang/Optimizer/Support/InitFIR.h
+++ b/flang/include/flang/Optimizer/Support/InitFIR.h
@@ -13,6 +13,7 @@
 #ifndef FORTRAN_OPTIMIZER_SUPPORT_INITFIR_H
 #define FORTRAN_OPTIMIZER_SUPPORT_INITFIR_H
 
+#include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/HLFIR/HLFIRDialect.h"
 #include "mlir/Conversion/Passes.h"
@@ -34,7 +35,7 @@ namespace fir::support {
       mlir::scf::SCFDialect, mlir::arith::ArithDialect,                        \
       mlir::cf::ControlFlowDialect, mlir::func::FuncDialect,                   \
       mlir::vector::VectorDialect, mlir::math::MathDialect,                    \
-      mlir::complex::ComplexDialect, mlir::DLTIDialect
+      mlir::complex::ComplexDialect, mlir::DLTIDialect, cuf::CUFDialect
 
 #define FLANG_CODEGEN_DIALECT_LIST FIRCodeGenDialect, mlir::LLVM::LLVMDialect
 
diff --git a/flang/include/flang/Optimizer/Support/Utils.h b/flang/include/flang/Optimizer/Support/Utils.h
index 2da6f24da40e..d8bcb5fae034 100644
--- a/flang/include/flang/Optimizer/Support/Utils.h
+++ b/flang/include/flang/Optimizer/Support/Utils.h
@@ -16,6 +16,7 @@
 #include "flang/Common/default-kinds.h"
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/Support/FatalError.h"
@@ -136,66 +137,6 @@ inline void intrinsicTypeTODO(fir::FirOpBuilder &builder, mlir::Type type,
            " in " + intrinsicName);
 }
 
-inline fir::CUDADataAttributeAttr
-getCUDADataAttribute(mlir::MLIRContext *mlirContext,
-                     std::optional<Fortran::common::CUDADataAttr> cudaAttr) {
-  if (cudaAttr) {
-    fir::CUDADataAttribute attr;
-    switch (*cudaAttr) {
-    case Fortran::common::CUDADataAttr::Constant:
-      attr = fir::CUDADataAttribute::Constant;
-      break;
-    case Fortran::common::CUDADataAttr::Device:
-      attr = fir::CUDADataAttribute::Device;
-      break;
-    case Fortran::common::CUDADataAttr::Managed:
-      attr = fir::CUDADataAttribute::Managed;
-      break;
-    case Fortran::common::CUDADataAttr::Pinned:
-      attr = fir::CUDADataAttribute::Pinned;
-      break;
-    case Fortran::common::CUDADataAttr::Shared:
-      attr = fir::CUDADataAttribute::Shared;
-      break;
-    case Fortran::common::CUDADataAttr::Texture:
-      // Obsolete attribute
-      return {};
-    case Fortran::common::CUDADataAttr::Unified:
-      attr = fir::CUDADataAttribute::Unified;
-      break;
-    }
-    return fir::CUDADataAttributeAttr::get(mlirContext, attr);
-  }
-  return {};
-}
-
-inline fir::CUDAProcAttributeAttr getCUDAProcAttribute(
-    mlir::MLIRContext *mlirContext,
-    std::optional<Fortran::common::CUDASubprogramAttrs> cudaAttr) {
-  if (cudaAttr) {
-    fir::CUDAProcAttribute attr;
-    switch (*cudaAttr) {
-    case Fortran::common::CUDASubprogramAttrs::Host:
-      attr = fir::CUDAProcAttribute::Host;
-      break;
-    case Fortran::common::CUDASubprogramAttrs::Device:
-      attr = fir::CUDAProcAttribute::Device;
-      break;
-    case Fortran::common::CUDASubprogramAttrs::HostDevice:
-      attr = fir::CUDAProcAttribute::HostDevice;
-      break;
-    case Fortran::common::CUDASubprogramAttrs::Global:
-      attr = fir::CUDAProcAttribute::Global;
-      break;
-    case Fortran::common::CUDASubprogramAttrs::Grid_Global:
-      attr = fir::CUDAProcAttribute::GridGlobal;
-      break;
-    }
-    return fir::CUDAProcAttributeAttr::get(mlirContext, attr);
-  }
-  return {};
-}
-
 } // namespace fir
 
 #endif // FORTRAN_OPTIMIZER_SUPPORT_UTILS_H
diff --git a/flang/lib/Frontend/CMakeLists.txt b/flang/lib/Frontend/CMakeLists.txt
index a701c264bc4c..f85665d11429 100644
--- a/flang/lib/Frontend/CMakeLists.txt
+++ b/flang/lib/Frontend/CMakeLists.txt
@@ -14,6 +14,7 @@ add_flang_library(flangFrontend
   TextDiagnostic.cpp
 
   DEPENDS
+  CUFDialect
   FIRDialect
   FIROptCodeGenPassIncGen
   FIROptTransformsPassIncGen
@@ -23,6 +24,7 @@ add_flang_library(flangFrontend
   ${extension_libs}
 
   LINK_LIBS
+  CUFDialect
   FortranParser
   FortranSemantics
   FortranEvaluate
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index a1957c0eb1bb..61f4bbd856a8 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -24,6 +24,7 @@
 #include "flang/Optimizer/Builder/FIRBuilder.h"
 #include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/Support/FatalError.h"
@@ -729,9 +730,9 @@ private:
                               ErrorManager &errorManager,
                               const Fortran::semantics::Symbol &sym) {
     Fortran::lower::StatementContext stmtCtx;
-    fir::CUDADataAttributeAttr cudaAttr =
-        Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
-                                                         sym);
+    cuf::DataAttributeAttr cudaAttr =
+        Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
+                                                        sym);
     mlir::Value errmsg = errMsgExpr ? errorManager.errMsgAddr : nullptr;
     mlir::Value stream =
         streamExpr
@@ -746,7 +747,7 @@ private:
     // Keep return type the same as a standard AllocatableAllocate call.
     mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
     return builder
-        .create<fir::CUDAAllocateOp>(
+        .create<cuf::AllocateOp>(
             loc, retTy, box.getAddr(), errmsg, stream, pinned, source, cudaAttr,
             errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr)
         .getResult();
@@ -804,9 +805,9 @@ static mlir::Value genCudaDeallocate(fir::FirOpBuilder &builder,
                                      const fir::MutableBoxValue &box,
                                      ErrorManager &errorManager,
                                      const Fortran::semantics::Symbol &sym) {
-  fir::CUDADataAttributeAttr cudaAttr =
-      Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
-                                                       sym);
+  cuf::DataAttributeAttr cudaAttr =
+      Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
+                                                      sym);
   mlir::Value errmsg =
       mlir::isa<fir::AbsentOp>(errorManager.errMsgAddr.getDefiningOp())
           ? nullptr
@@ -815,7 +816,7 @@ static mlir::Value genCudaDeallocate(fir::FirOpBuilder &builder,
   // Keep return type the same as a standard AllocatableAllocate call.
   mlir::Type retTy = fir::runtime::getModel<int>()(builder.getContext());
   return builder
-      .create<fir::CUDADeallocateOp>(
+      .create<cuf::DeallocateOp>(
           loc, retTy, box.getAddr(), errmsg, cudaAttr,
           errorManager.hasStatSpec() ? builder.getUnitAttr() : nullptr)
       .getResult();
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index afbc1122de86..4a29c6b8fae7 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -40,6 +40,8 @@
 #include "flang/Optimizer/Builder/Runtime/Ragged.h"
 #include "flang/Optimizer/Builder/Runtime/Stop.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
@@ -2648,8 +2650,8 @@ private:
         loopEval = &*std::next(loopEval->getNestedEvaluations().begin());
     }
 
-    auto op = builder->create<fir::CUDAKernelOp>(
-        loc, gridValues, blockValues, streamValue, lbs, ubs, steps, n);
+    auto op = builder->create<cuf::KernelOp>(loc, gridValues, blockValues,
+                                             streamValue, lbs, ubs, steps, n);
     builder->createBlock(&op.getRegion(), op.getRegion().end(), ivTypes,
                          ivLocs);
     mlir::Block &b = op.getRegion().back();
@@ -3719,46 +3721,44 @@ private:
 
     // device = host
     if (lhsIsDevice && !rhsIsDevice) {
-      auto transferKindAttr = fir::CUDADataTransferKindAttr::get(
-          builder.getContext(), fir::CUDADataTransferKind::HostDevice);
+      auto transferKindAttr = cuf::DataTransferKindAttr::get(
+          builder.getContext(), cuf::DataTransferKind::HostDevice);
       if (!rhs.isVariable()) {
         auto associate = hlfir::genAssociateExpr(
             loc, builder, rhs, rhs.getType(), ".cuf_host_tmp");
-        builder.create<fir::CUDADataTransferOp>(loc, associate.getBase(), lhs,
-                                                transferKindAttr);
+        builder.create<cuf::DataTransferOp>(loc, associate.getBase(), lhs,
+                                            transferKindAttr);
         builder.create<hlfir::EndAssociateOp>(loc, associate);
       } else {
-        builder.create<fir::CUDADataTransferOp>(loc, rhs, lhs,
-                                                transferKindAttr);
+        builder.create<cuf::DataTransferOp>(loc, rhs, lhs, transferKindAttr);
       }
       return;
     }
 
     // host = device
     if (!lhsIsDevice && rhsIsDevice) {
-      auto transferKindAttr = fir::CUDADataTransferKindAttr::get(
-          builder.getContext(), fir::CUDADataTransferKind::DeviceHost);
+      auto transferKindAttr = cuf::DataTransferKindAttr::get(
+          builder.getContext(), cuf::DataTransferKind::DeviceHost);
       if (!rhs.isVariable()) {
         // evaluateRhs loads scalar. Look for the memory reference to be used in
         // the transfer.
         if (mlir::isa_and_nonnull<fir::LoadOp>(rhs.getDefiningOp())) {
           auto loadOp = mlir::dyn_cast<fir::LoadOp>(rhs.getDefiningOp());
-          builder.create<fir::CUDADataTransferOp>(loc, loadOp.getMemref(), lhs,
-                                                  transferKindAttr);
+          builder.create<cuf::DataTransferOp>(loc, loadOp.getMemref(), lhs,
+                                              transferKindAttr);
           return;
         }
       } else {
-        builder.create<fir::CUDADataTransferOp>(loc, rhs, lhs,
-                                                transferKindAttr);
+        builder.create<cuf::DataTransferOp>(loc, rhs, lhs, transferKindAttr);
       }
       return;
     }
 
     if (lhsIsDevice && rhsIsDevice) {
       assert(rhs.isVariable() && "CUDA Fortran assignment rhs is not legal");
-      auto transferKindAttr = fir::CUDADataTransferKindAttr::get(
-          builder.getContext(), fir::CUDADataTransferKind::DeviceDevice);
-      builder.create<fir::CUDADataTransferOp>(loc, rhs, lhs, transferKindAttr);
+      auto transferKindAttr = cuf::DataTransferKindAttr::get(
+          builder.getContext(), cuf::DataTransferKind::DeviceDevice);
+      builder.create<cuf::DataTransferOp>(loc, rhs, lhs, transferKindAttr);
       return;
     }
     llvm_unreachable("Unhandled CUDA data transfer");
@@ -3769,8 +3769,8 @@ private:
                               const Fortran::evaluate::Assignment &assign) {
     llvm::SmallVector<mlir::Value> temps;
     localSymbols.pushScope();
-    auto transferKindAttr = fir::CUDADataTransferKindAttr::get(
-        builder.getContext(), fir::CUDADataTransferKind::DeviceHost);
+    auto transferKindAttr = cuf::DataTransferKindAttr::get(
+        builder.getContext(), cuf::DataTransferKind::DeviceHost);
     [[maybe_unused]] unsigned nbDeviceResidentObject = 0;
     for (const Fortran::semantics::Symbol &sym :
          Fortran::evaluate::CollectSymbols(assign.rhs)) {
@@ -3795,8 +3795,8 @@ private:
           addSymbol(sym,
                     hlfir::translateToExtendedValue(loc, builder, temp).first,
                     /*forced=*/true);
-          builder.create<fir::CUDADataTransferOp>(loc, addr, temp,
-                                                  transferKindAttr);
+          builder.create<cuf::DataTransferOp>(loc, addr, temp,
+                                              transferKindAttr);
           ++nbDeviceResidentObject;
         }
       }
@@ -3808,15 +3808,15 @@ private:
   // subprogram are not considered fully device context so it will return false
   // for it.
   static bool isDeviceContext(fir::FirOpBuilder &builder) {
-    if (builder.getRegion().getParentOfType<fir::CUDAKernelOp>())
+    if (builder.getRegion().getParentOfType<cuf::KernelOp>())
       return true;
     if (auto funcOp =
             builder.getRegion().getParentOfType<mlir::func::FuncOp>()) {
       if (auto cudaProcAttr =
-              funcOp.getOperation()->getAttrOfType<fir::CUDAProcAttributeAttr>(
-                  fir::getCUDAAttrName())) {
-        return cudaProcAttr.getValue() != fir::CUDAProcAttribute::Host &&
-               cudaProcAttr.getValue() != fir::CUDAProcAttribute::HostDevice;
+              funcOp.getOperation()->getAttrOfType<cuf::ProcAttributeAttr>(
+                  cuf::getProcAttrName())) {
+        return cudaProcAttr.getValue() != cuf::ProcAttribute::Host &&
+               cudaProcAttr.getValue() != cuf::ProcAttribute::HostDevice;
       }
     }
     return false;
diff --git a/flang/lib/Lower/CMakeLists.txt b/flang/lib/Lower/CMakeLists.txt
index 1546409752e7..ba6622d8504a 100644
--- a/flang/lib/Lower/CMakeLists.txt
+++ b/flang/lib/Lower/CMakeLists.txt
@@ -37,6 +37,8 @@ add_flang_library(FortranLower
   VectorSubscripts.cpp
   
   DEPENDS
+  CUFAttrs
+  CUFDialect
   FIRDialect
   FIRTransforms
   HLFIRDialect
@@ -44,6 +46,8 @@ add_flang_library(FortranLower
   ${extension_libs}
 
   LINK_LIBS
+  CUFAttrs
+  CUFDialect
   FIRDialect
   FIRDialectSupport
   FIRBuilder
diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index c1f54ad39287..cfbb7c7f6b4f 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -627,9 +627,9 @@ setCUDAAttributes(mlir::func::FuncOp func,
                       characteristic) {
   if (characteristic && characteristic->cudaSubprogramAttrs) {
     func.getOperation()->setAttr(
-        fir::getCUDAAttrName(),
-        fir::getCUDAProcAttribute(func.getContext(),
-                                  *characteristic->cudaSubprogramAttrs));
+        cuf::getProcAttrName(),
+        cuf::getProcAttribute(func.getContext(),
+                              *characteristic->cudaSubprogramAttrs));
   }
 
   if (sym) {
@@ -649,9 +649,9 @@ setCUDAAttributes(mlir::func::FuncOp func,
           ubAttr =
               mlir::IntegerAttr::get(i64Ty, details->cudaLaunchBounds()[2]);
         func.getOperation()->setAttr(
-            fir::getCUDALaunchBoundsAttrName(),
-            fir::CUDALaunchBoundsAttr::get(func.getContext(), maxTPBAttr,
-                                           minBPMAttr, ubAttr));
+            cuf::getLaunchBoundsAttrName(),
+            cuf::LaunchBoundsAttr::get(func.getContext(), maxTPBAttr,
+                                       minBPMAttr, ubAttr));
       }
 
       if (!details->cudaClusterDims().empty()) {
@@ -663,9 +663,8 @@ setCUDAAttributes(mlir::func::FuncOp func,
         auto zAttr =
             mlir::IntegerAttr::get(i64Ty, details->cudaClusterDims()[2]);
         func.getOperation()->setAttr(
-            fir::getCUDAClusterDimsAttrName(),
-            fir::CUDAClusterDimsAttr::get(func.getContext(), xAttr, yAttr,
-                                          zAttr));
+            cuf::getClusterDimsAttrName(),
+            cuf::ClusterDimsAttr::get(func.getContext(), xAttr, yAttr, zAttr));
       }
     }
   }
@@ -1116,8 +1115,8 @@ private:
       addMLIRAttr(fir::getTargetAttrName());
     if (obj.cudaDataAttr)
       attrs.emplace_back(
-          mlir::StringAttr::get(&mlirContext, fir::getCUDAAttrName()),
-          fir::getCUDADataAttribute(&mlirContext, obj.cudaDataAttr));
+          mlir::StringAttr::get(&mlirContext, cuf::getDataAttrName()),
+          cuf::getDataAttribute(&mlirContext, obj.cudaDataAttr));
 
     // TODO: intents that require special care (e.g finalization)
 
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index c6bfe3592169..7ec719a2cb9e 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -28,6 +28,7 @@
 #include "flang/Optimizer/Builder/MutableBox.h"
 #include "flang/Optimizer/Builder/Runtime/Derived.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "mlir/IR/IRMapping.h"
@@ -589,7 +590,7 @@ std::pair<fir::ExtendedValue, bool> Fortran::lower::genCallOpAndResult(
           fir::getBase(converter.genExprValue(
               caller.getCallDescription().chevrons()[3], stmtCtx)));
 
-    builder.create<fir::CUDAKernelLaunch>(
+    builder.create<cuf::KernelLaunchOp>(
         loc, funcType.getResults(), funcSymbolAttr, grid_x, grid_y, grid_z,
         block_x, block_y, block_z, bytes, stream, operands);
     callNumResults = 0;
diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp
index 5ddd8a6a9d41..b8868161fa05 100644
--- a/flang/lib/Lower/ConvertVariable.cpp
+++ b/flang/lib/Lower/ConvertVariable.cpp
@@ -30,6 +30,7 @@
 #include "flang/Optimizer/Builder/IntrinsicCall.h"
 #include "flang/Optimizer/Builder/Runtime/Derived.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
 #include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
@@ -139,7 +140,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
                                   const Fortran::lower::pft::Variable &var,
                                   llvm::StringRef globalName,
                                   mlir::StringAttr linkage,
-                                  fir::CUDADataAttributeAttr cudaAttr = {});
+                                  cuf::DataAttributeAttr dataAttr = {});
 
 static mlir::Location genLocation(Fortran::lower::AbstractConverter &converter,
                                   const Fortran::semantics::Symbol &sym) {
@@ -172,12 +173,12 @@ static fir::GlobalOp declareGlobal(Fortran::lower::AbstractConverter &converter,
       !Fortran::semantics::IsProcedurePointer(ultimate))
     mlir::emitError(loc, "processing global declaration: symbol '")
         << toStringRef(sym.name()) << "' has unexpected details\n";
-  fir::CUDADataAttributeAttr cudaAttr =
-      Fortran::lower::translateSymbolCUDADataAttribute(
+  cuf::DataAttributeAttr dataAttr =
+      Fortran::lower::translateSymbolCUFDataAttribute(
           converter.getFirOpBuilder().getContext(), sym);
   return builder.createGlobal(loc, converter.genType(var), globalName, linkage,
                               mlir::Attribute{}, isConstant(ultimate),
-                              var.isTarget(), cudaAttr);
+                              var.isTarget(), dataAttr);
 }
 
 /// Temporary helper to catch todos in initial data target lowering.
@@ -474,7 +475,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
                                   const Fortran::lower::pft::Variable &var,
                                   llvm::StringRef globalName,
                                   mlir::StringAttr linkage,
-                                  fir::CUDADataAttributeAttr cudaAttr) {
+                                  cuf::DataAttributeAttr dataAttr) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   const Fortran::semantics::Symbol &sym = var.getSymbol();
   mlir::Location loc = genLocation(converter, sym);
@@ -514,7 +515,7 @@ static fir::GlobalOp defineGlobal(Fortran::lower::AbstractConverter &converter,
   if (!global)
     global =
         builder.createGlobal(loc, symTy, globalName, linkage, mlir::Attribute{},
-                             isConst, var.isTarget(), cudaAttr);
+                             isConst, var.isTarget(), dataAttr);
   if (Fortran::semantics::IsAllocatableOrPointer(sym) &&
       !Fortran::semantics::IsProcedure(sym)) {
     const auto *details =
@@ -694,9 +695,9 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter,
     return builder.create<fir::ZeroOp>(loc, fir::ReferenceType::get(ty));
 
   if (Fortran::semantics::NeedCUDAAlloc(ultimateSymbol)) {
-    fir::CUDADataAttributeAttr cudaAttr =
-        Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
-                                                         ultimateSymbol);
+    cuf::DataAttributeAttr dataAttr =
+        Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
+                                                        ultimateSymbol);
     llvm::SmallVector<mlir::Value> indices;
     llvm::SmallVector<mlir::Value> elidedShape =
         fir::factory::elideExtentsAlreadyInType(ty, shape);
@@ -705,8 +706,8 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter,
     auto idxTy = builder.getIndexType();
     for (mlir::Value sh : elidedShape)
       indices.push_back(builder.createConvert(loc, idxTy, sh));
-    return builder.create<fir::CUDAAllocOp>(loc, ty, nm, symNm, cudaAttr,
-                                            lenParams, indices);
+    return builder.create<cuf::AllocOp>(loc, ty, nm, symNm, dataAttr, lenParams,
+                                        indices);
   }
 
   // Let the builder do all the heavy lifting.
@@ -950,10 +951,10 @@ static void instantiateLocal(Fortran::lower::AbstractConverter &converter,
         converter.getSymbolExtendedValue(var.getSymbol(), &symMap);
     auto *sym = &var.getSymbol();
     converter.getFctCtx().attachCleanup([builder, loc, exv, sym]() {
-      fir::CUDADataAttributeAttr cudaAttr =
-          Fortran::lower::translateSymbolCUDADataAttribute(
-              builder->getContext(), *sym);
-      builder->create<fir::CUDAFreeOp>(loc, fir::getBase(exv), cudaAttr);
+      cuf::DataAttributeAttr dataAttr =
+          Fortran::lower::translateSymbolCUFDataAttribute(builder->getContext(),
+                                                          *sym);
+      builder->create<cuf::FreeOp>(loc, fir::getBase(exv), dataAttr);
     });
   }
 }
@@ -1628,11 +1629,11 @@ fir::FortranVariableFlagsAttr Fortran::lower::translateSymbolAttributes(
   return fir::FortranVariableFlagsAttr::get(mlirContext, flags);
 }
 
-fir::CUDADataAttributeAttr Fortran::lower::translateSymbolCUDADataAttribute(
+cuf::DataAttributeAttr Fortran::lower::translateSymbolCUFDataAttribute(
     mlir::MLIRContext *mlirContext, const Fortran::semantics::Symbol &sym) {
   std::optional<Fortran::common::CUDADataAttr> cudaAttr =
       Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
-  return fir::getCUDADataAttribute(mlirContext, cudaAttr);
+  return cuf::getDataAttribute(mlirContext, cudaAttr);
 }
 
 /// Map a symbol to its FIR address and evaluated specification expressions.
@@ -1672,9 +1673,9 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter,
     auto name = converter.mangleName(sym);
     fir::FortranVariableFlagsAttr attributes =
         Fortran::lower::translateSymbolAttributes(builder.getContext(), sym);
-    fir::CUDADataAttributeAttr cudaAttr =
-        Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
-                                                         sym);
+    cuf::DataAttributeAttr dataAttr =
+        Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
+                                                        sym);
 
     if (sym.test(Fortran::semantics::Symbol::Flag::CrayPointee)) {
       mlir::Type ptrBoxType =
@@ -1716,7 +1717,7 @@ static void genDeclareSymbol(Fortran::lower::AbstractConverter &converter,
       dummyScope = converter.dummyArgsScopeValue();
     auto newBase = builder.create<hlfir::DeclareOp>(
         loc, base, name, shapeOrShift, lenParams, dummyScope, attributes,
-        cudaAttr);
+        dataAttr);
     symMap.addVariableDefinition(sym, newBase, force);
     return;
   }
@@ -1762,15 +1763,15 @@ void Fortran::lower::genDeclareSymbol(
     fir::FortranVariableFlagsAttr attributes =
         Fortran::lower::translateSymbolAttributes(
             builder.getContext(), sym.GetUltimate(), extraFlags);
-    fir::CUDADataAttributeAttr cudaAttr =
-        Fortran::lower::translateSymbolCUDADataAttribute(builder.getContext(),
-                                                         sym.GetUltimate());
+    cuf::DataAttributeAttr dataAttr =
+        Fortran::lower::translateSymbolCUFDataAttribute(builder.getContext(),
+                                                        sym.GetUltimate());
     auto name = converter.mangleName(sym);
     mlir::Value dummyScope;
     if (converter.isRegisteredDummySymbol(sym))
       dummyScope = converter.dummyArgsScopeValue();
     hlfir::EntityWithAttributes declare = hlfir::genDeclare(
-        loc, builder, exv, name, attributes, dummyScope, cudaAttr);
+        loc, builder, exv, name, attributes, dummyScope, dataAttr);
     symMap.addVariableDefinition(sym, declare.getIfVariableInterface(), force);
     return;
   }
@@ -2272,10 +2273,10 @@ void Fortran::lower::defineModuleVariable(
     // Do nothing. Mapping will be done on user side.
   } else {
     std::string globalName = converter.mangleName(sym);
-    fir::CUDADataAttributeAttr cudaAttr =
-        Fortran::lower::translateSymbolCUDADataAttribute(
+    cuf::DataAttributeAttr dataAttr =
+        Fortran::lower::translateSymbolCUFDataAttribute(
             converter.getFirOpBuilder().getContext(), sym);
-    defineGlobal(converter, var, globalName, linkage, cudaAttr);
+    defineGlobal(converter, var, globalName, linkage, dataAttr);
   }
 }
 
diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
index bd018d7f015b..3c3fd02d7c88 100644
--- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp
+++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -322,18 +322,18 @@ mlir::Value fir::FirOpBuilder::createHeapTemporary(
 fir::GlobalOp fir::FirOpBuilder::createGlobal(
     mlir::Location loc, mlir::Type type, llvm::StringRef name,
     mlir::StringAttr linkage, mlir::Attribute value, bool isConst,
-    bool isTarget, fir::CUDADataAttributeAttr cudaAttr) {
+    bool isTarget, cuf::DataAttributeAttr dataAttr) {
   if (auto global = getNamedGlobal(name))
     return global;
   auto module = getModule();
   auto insertPt = saveInsertionPoint();
   setInsertionPoint(module.getBody(), module.getBody()->end());
   llvm::SmallVector<mlir::NamedAttribute> attrs;
-  if (cudaAttr) {
+  if (dataAttr) {
     auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
                                             module.getContext());
     attrs.push_back(mlir::NamedAttribute(
-        fir::GlobalOp::getCudaAttrAttrName(globalOpName), cudaAttr));
+        fir::GlobalOp::getDataAttrAttrName(globalOpName), dataAttr));
   }
   auto glob = create<fir::GlobalOp>(loc, name, isConst, isTarget, type, value,
                                     linkage, attrs);
@@ -346,7 +346,7 @@ fir::GlobalOp fir::FirOpBuilder::createGlobal(
 fir::GlobalOp fir::FirOpBuilder::createGlobal(
     mlir::Location loc, mlir::Type type, llvm::StringRef name, bool isConst,
     bool isTarget, std::function<void(FirOpBuilder &)> bodyBuilder,
-    mlir::StringAttr linkage, fir::CUDADataAttributeAttr cudaAttr) {
+    mlir::StringAttr linkage, cuf::DataAttributeAttr dataAttr) {
   if (auto global = getNamedGlobal(name))
     return global;
   auto module = getModule();
diff --git a/flang/lib/Optimizer/Builder/HLFIRTools.cpp b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
index 8fdab2a57181..511585dc7689 100644
--- a/flang/lib/Optimizer/Builder/HLFIRTools.cpp
+++ b/flang/lib/Optimizer/Builder/HLFIRTools.cpp
@@ -199,7 +199,7 @@ fir::FortranVariableOpInterface
 hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder,
                   const fir::ExtendedValue &exv, llvm::StringRef name,
                   fir::FortranVariableFlagsAttr flags, mlir::Value dummyScope,
-                  fir::CUDADataAttributeAttr cudaAttr) {
+                  cuf::DataAttributeAttr dataAttr) {
 
   mlir::Value base = fir::getBase(exv);
   assert(fir::conformsWithPassByRef(base.getType()) &&
@@ -229,7 +229,7 @@ hlfir::genDeclare(mlir::Location loc, fir::FirOpBuilder &builder,
       },
       [](const auto &) {});
   auto declareOp = builder.create<hlfir::DeclareOp>(
-      loc, base, name, shapeOrShift, lenParams, dummyScope, flags, cudaAttr);
+      loc, base, name, shapeOrShift, lenParams, dummyScope, flags, dataAttr);
   return mlir::cast<fir::FortranVariableOpInterface>(declareOp.getOperation());
 }
 
diff --git a/flang/lib/Optimizer/Dialect/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CMakeLists.txt
index 745439b7e1e5..a8235f841b87 100644
--- a/flang/lib/Optimizer/Dialect/CMakeLists.txt
+++ b/flang/lib/Optimizer/Dialect/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_subdirectory(Support)
+add_subdirectory(CUF)
 
 add_flang_library(FIRDialect
   FIRAttr.cpp
@@ -13,9 +14,11 @@ add_flang_library(FIRDialect
   CanonicalizationPatternsIncGen
   MLIRIR
   FIROpsIncGen
+  CUFAttrsIncGen
   intrinsics_gen
 
   LINK_LIBS
+  CUFAttrs
   FIRDialectSupport
   MLIRArithDialect
   MLIRBuiltinToLLVMIRTranslation
diff --git a/flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt
new file mode 100644
index 000000000000..81db40f3ba46
--- /dev/null
+++ b/flang/lib/Optimizer/Dialect/CUF/Attributes/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Keep CUF attributes as a separate library as FIR and HLFIR depend on it. 
+add_flang_library(CUFAttrs
+  CUFAttr.cpp
+
+  DEPENDS
+  MLIRIR
+  CUFAttrsIncGen
+
+  LINK_LIBS
+  MLIRTargetLLVMIRExport
+
+  LINK_COMPONENTS
+  AsmParser
+  AsmPrinter
+  Remarks
+)
diff --git a/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp b/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
new file mode 100644
index 000000000000..52c733dcad6f
--- /dev/null
+++ b/flang/lib/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp
@@ -0,0 +1,32 @@
+//===-- CUFAttr.cpp -------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
+#include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/OpDefinition.h"
+#include "llvm/ADT/TypeSwitch.h"
+
+#include "flang/Optimizer/Dialect/CUF/Attributes/CUFEnumAttr.cpp.inc"
+#define GET_ATTRDEF_CLASSES
+#include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.cpp.inc"
+
+namespace cuf {
+
+void CUFDialect::registerAttributes() {
+  addAttributes<ClusterDimsAttr, DataAttributeAttr, DataTransferKindAttr,
+                LaunchBoundsAttr, ProcAttributeAttr>();
+}
+
+} // namespace cuf
diff --git a/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt
new file mode 100644
index 000000000000..d5ce5e0a7614
--- /dev/null
+++ b/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt
@@ -0,0 +1,22 @@
+add_subdirectory(Attributes)
+
+add_flang_library(CUFDialect
+  CUFDialect.cpp
+  CUFOps.cpp
+
+  DEPENDS
+  MLIRIR
+  CUFOpsIncGen
+
+  LINK_LIBS
+  CUFAttrs
+  FIRDialect
+  FIRDialectSupport
+  MLIRIR
+  MLIRTargetLLVMIRExport
+
+  LINK_COMPONENTS
+  AsmParser
+  AsmPrinter
+  Remarks
+)
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFDialect.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFDialect.cpp
new file mode 100644
index 000000000000..47d3636df4f9
--- /dev/null
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFDialect.cpp
@@ -0,0 +1,25 @@
+//===-- CUFDialect.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
+#include "flang/Optimizer/Dialect/FIRDialect.h"
+
+#include "flang/Optimizer/Dialect/CUF/CUFDialect.cpp.inc"
+
+void cuf::CUFDialect::initialize() {
+  registerAttributes();
+  addOperations<
+#define GET_OP_LIST
+#include "flang/Optimizer/Dialect/CUF/CUFOps.cpp.inc"
+      >();
+}
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
new file mode 100644
index 000000000000..870652c72fab
--- /dev/null
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -0,0 +1,219 @@
+//===-- CUFOps.cpp --------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coding style: https://mlir.llvm.org/getting_started/DeveloperGuide/
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
+#include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
+#include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/IR/Matchers.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/PatternMatch.h"
+#include "llvm/ADT/SmallVector.h"
+
+//===----------------------------------------------------------------------===//
+// AllocOp
+//===----------------------------------------------------------------------===//
+
+static mlir::Type wrapAllocaResultType(mlir::Type intype) {
+  if (mlir::isa<fir::ReferenceType>(intype))
+    return {};
+  return fir::ReferenceType::get(intype);
+}
+
+void cuf::AllocOp::build(mlir::OpBuilder &builder, mlir::OperationState &result,
+                         mlir::Type inType, llvm::StringRef uniqName,
+                         llvm::StringRef bindcName,
+                         cuf::DataAttributeAttr cudaAttr,
+                         mlir::ValueRange typeparams, mlir::ValueRange shape,
+                         llvm::ArrayRef<mlir::NamedAttribute> attributes) {
+  mlir::StringAttr nameAttr =
+      uniqName.empty() ? mlir::StringAttr{} : builder.getStringAttr(uniqName);
+  mlir::StringAttr bindcAttr =
+      bindcName.empty() ? mlir::StringAttr{} : builder.getStringAttr(bindcName);
+  build(builder, result, wrapAllocaResultType(inType),
+        mlir::TypeAttr::get(inType), nameAttr, bindcAttr, typeparams, shape,
+        cudaAttr);
+  result.addAttributes(attributes);
+}
+
+template <typename Op>
+static mlir::LogicalResult checkCudaAttr(Op op) {
+  if (op.getDataAttr() == cuf::DataAttribute::Device ||
+      op.getDataAttr() == cuf::DataAttribute::Managed ||
+      op.getDataAttr() == cuf::DataAttribute::Unified)
+    return mlir::success();
+  return op.emitOpError("expect device, managed or unified cuda attribute");
+}
+
+mlir::LogicalResult cuf::AllocOp::verify() { return checkCudaAttr(*this); }
+
+//===----------------------------------------------------------------------===//
+// FreeOp
+//===----------------------------------------------------------------------===//
+
+mlir::LogicalResult cuf::FreeOp::verify() { return checkCudaAttr(*this); }
+
+//===----------------------------------------------------------------------===//
+// AllocateOp
+//===----------------------------------------------------------------------===//
+
+mlir::LogicalResult cuf::AllocateOp::verify() {
+  if (getPinned() && getStream())
+    return emitOpError("pinned and stream cannot appears at the same time");
+  if (!mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(getBox().getType())))
+    return emitOpError(
+        "expect box to be a reference to a class or box type value");
+  if (getSource() &&
+      !mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(getSource().getType())))
+    return emitOpError(
+        "expect source to be a reference to/or a class or box type value");
+  if (getErrmsg() &&
+      !mlir::isa<fir::BoxType>(fir::unwrapRefType(getErrmsg().getType())))
+    return emitOpError(
+        "expect errmsg to be a reference to/or a box type value");
+  if (getErrmsg() && !getHasStat())
+    return emitOpError("expect stat attribute when errmsg is provided");
+  return mlir::success();
+}
+
+//===----------------------------------------------------------------------===//
+// DeallocateOp
+//===----------------------------------------------------------------------===//
+
+mlir::LogicalResult cuf::DeallocateOp::verify() {
+  if (!mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(getBox().getType())))
+    return emitOpError(
+        "expect box to be a reference to class or box type value");
+  if (getErrmsg() &&
+      !mlir::isa<fir::BoxType>(fir::unwrapRefType(getErrmsg().getType())))
+    return emitOpError(
+        "expect errmsg to be a reference to/or a box type value");
+  if (getErrmsg() && !getHasStat())
+    return emitOpError("expect stat attribute when errmsg is provided");
+  return mlir::success();
+}
+
+//===----------------------------------------------------------------------===//
+// KernelOp
+//===----------------------------------------------------------------------===//
+
+llvm::SmallVector<mlir::Region *> cuf::KernelOp::getLoopRegions() {
+  return {&getRegion()};
+}
+
+mlir::ParseResult parseCUFKernelValues(
+    mlir::OpAsmParser &parser,
+    llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &values,
+    llvm::SmallVectorImpl<mlir::Type> &types) {
+  if (mlir::succeeded(parser.parseOptionalStar()))
+    return mlir::success();
+
+  if (mlir::succeeded(parser.parseOptionalLParen())) {
+    if (mlir::failed(parser.parseCommaSeparatedList(
+            mlir::AsmParser::Delimiter::None, [&]() {
+              if (parser.parseOperand(values.emplace_back()))
+                return mlir::failure();
+              return mlir::success();
+            })))
+      return mlir::failure();
+    auto builder = parser.getBuilder();
+    for (size_t i = 0; i < values.size(); i++) {
+      types.emplace_back(builder.getI32Type());
+    }
+    if (parser.parseRParen())
+      return mlir::failure();
+  } else {
+    if (parser.parseOperand(values.emplace_back()))
+      return mlir::failure();
+    auto builder = parser.getBuilder();
+    types.emplace_back(builder.getI32Type());
+    return mlir::success();
+  }
+  return mlir::success();
+}
+
+void printCUFKernelValues(mlir::OpAsmPrinter &p, mlir::Operation *op,
+                          mlir::ValueRange values, mlir::TypeRange types) {
+  if (values.empty())
+    p << "*";
+
+  if (values.size() > 1)
+    p << "(";
+  llvm::interleaveComma(values, p, [&p](mlir::Value v) { p << v; });
+  if (values.size() > 1)
+    p << ")";
+}
+
+mlir::ParseResult parseCUFKernelLoopControl(
+    mlir::OpAsmParser &parser, mlir::Region &region,
+    llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &lowerbound,
+    llvm::SmallVectorImpl<mlir::Type> &lowerboundType,
+    llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &upperbound,
+    llvm::SmallVectorImpl<mlir::Type> &upperboundType,
+    llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &step,
+    llvm::SmallVectorImpl<mlir::Type> &stepType) {
+
+  llvm::SmallVector<mlir::OpAsmParser::Argument> inductionVars;
+  if (parser.parseLParen() ||
+      parser.parseArgumentList(inductionVars,
+                               mlir::OpAsmParser::Delimiter::None,
+                               /*allowType=*/true) ||
+      parser.parseRParen() || parser.parseEqual() || parser.parseLParen() ||
+      parser.parseOperandList(lowerbound, inductionVars.size(),
+                              mlir::OpAsmParser::Delimiter::None) ||
+      parser.parseColonTypeList(lowerboundType) || parser.parseRParen() ||
+      parser.parseKeyword("to") || parser.parseLParen() ||
+      parser.parseOperandList(upperbound, inductionVars.size(),
+                              mlir::OpAsmParser::Delimiter::None) ||
+      parser.parseColonTypeList(upperboundType) || parser.parseRParen() ||
+      parser.parseKeyword("step") || parser.parseLParen() ||
+      parser.parseOperandList(step, inductionVars.size(),
+                              mlir::OpAsmParser::Delimiter::None) ||
+      parser.parseColonTypeList(stepType) || parser.parseRParen())
+    return mlir::failure();
+  return parser.parseRegion(region, inductionVars);
+}
+
+void printCUFKernelLoopControl(
+    mlir::OpAsmPrinter &p, mlir::Operation *op, mlir::Region &region,
+    mlir::ValueRange lowerbound, mlir::TypeRange lowerboundType,
+    mlir::ValueRange upperbound, mlir::TypeRange upperboundType,
+    mlir::ValueRange steps, mlir::TypeRange stepType) {
+  mlir::ValueRange regionArgs = region.front().getArguments();
+  if (!regionArgs.empty()) {
+    p << "(";
+    llvm::interleaveComma(
+        regionArgs, p, [&p](mlir::Value v) { p << v << " : " << v.getType(); });
+    p << ") = (" << lowerbound << " : " << lowerboundType << ") to ("
+      << upperbound << " : " << upperboundType << ") "
+      << " step (" << steps << " : " << stepType << ") ";
+  }
+  p.printRegion(region, /*printEntryBlockArgs=*/false);
+}
+
+mlir::LogicalResult cuf::KernelOp::verify() {
+  if (getLowerbound().size() != getUpperbound().size() ||
+      getLowerbound().size() != getStep().size())
+    return emitOpError(
+        "expect same number of values in lowerbound, upperbound and step");
+
+  return mlir::success();
+}
+
+// Tablegen operators
+
+#define GET_OP_CLASSES
+#include "flang/Optimizer/Dialect/CUF/CUFOps.cpp.inc"
diff --git a/flang/lib/Optimizer/Dialect/FIRAttr.cpp b/flang/lib/Optimizer/Dialect/FIRAttr.cpp
index 9ea3a0568f69..2faba63dfba0 100644
--- a/flang/lib/Optimizer/Dialect/FIRAttr.cpp
+++ b/flang/lib/Optimizer/Dialect/FIRAttr.cpp
@@ -298,7 +298,5 @@ void fir::printFirAttribute(FIROpsDialect *dialect, mlir::Attribute attr,
 void FIROpsDialect::registerAttributes() {
   addAttributes<ClosedIntervalAttr, ExactTypeAttr, FortranVariableFlagsAttr,
                 LowerBoundAttr, PointIntervalAttr, RealAttr, SubclassAttr,
-                UpperBoundAttr, CUDADataAttributeAttr, CUDAProcAttributeAttr,
-                CUDALaunchBoundsAttr, CUDAClusterDimsAttr,
-                CUDADataTransferKindAttr>();
+                UpperBoundAttr>();
 }
diff --git a/flang/lib/Optimizer/Dialect/FIRDialect.cpp b/flang/lib/Optimizer/Dialect/FIRDialect.cpp
index 4d1e8cd1405a..4b1dadaac672 100644
--- a/flang/lib/Optimizer/Dialect/FIRDialect.cpp
+++ b/flang/lib/Optimizer/Dialect/FIRDialect.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/Dialect/FIRDialect.h"
+#include "flang/Optimizer/Dialect/CUF/Attributes/CUFAttr.h"
 #include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp
index edf7f7f4b1a9..94113da9a46c 100644
--- a/flang/lib/Optimizer/Dialect/FIROps.cpp
+++ b/flang/lib/Optimizer/Dialect/FIROps.cpp
@@ -3898,169 +3898,6 @@ mlir::LogicalResult fir::DeclareOp::verify() {
   return fortranVar.verifyDeclareLikeOpImpl(getMemref());
 }
 
-llvm::SmallVector<mlir::Region *> fir::CUDAKernelOp::getLoopRegions() {
-  return {&getRegion()};
-}
-
-mlir::ParseResult parseCUFKernelValues(
-    mlir::OpAsmParser &parser,
-    llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &values,
-    llvm::SmallVectorImpl<mlir::Type> &types) {
-  if (mlir::succeeded(parser.parseOptionalStar()))
-    return mlir::success();
-
-  if (mlir::succeeded(parser.parseOptionalLParen())) {
-    if (mlir::failed(parser.parseCommaSeparatedList(
-            mlir::AsmParser::Delimiter::None, [&]() {
-              if (parser.parseOperand(values.emplace_back()))
-                return mlir::failure();
-              return mlir::success();
-            })))
-      return mlir::failure();
-    auto builder = parser.getBuilder();
-    for (size_t i = 0; i < values.size(); i++) {
-      types.emplace_back(builder.getI32Type());
-    }
-    if (parser.parseRParen())
-      return mlir::failure();
-  } else {
-    if (parser.parseOperand(values.emplace_back()))
-      return mlir::failure();
-    auto builder = parser.getBuilder();
-    types.emplace_back(builder.getI32Type());
-    return mlir::success();
-  }
-  return mlir::success();
-}
-
-void printCUFKernelValues(mlir::OpAsmPrinter &p, mlir::Operation *op,
-                          mlir::ValueRange values, mlir::TypeRange types) {
-  if (values.empty())
-    p << "*";
-
-  if (values.size() > 1)
-    p << "(";
-  llvm::interleaveComma(values, p, [&p](mlir::Value v) { p << v; });
-  if (values.size() > 1)
-    p << ")";
-}
-
-mlir::ParseResult parseCUFKernelLoopControl(
-    mlir::OpAsmParser &parser, mlir::Region &region,
-    llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &lowerbound,
-    llvm::SmallVectorImpl<mlir::Type> &lowerboundType,
-    llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &upperbound,
-    llvm::SmallVectorImpl<mlir::Type> &upperboundType,
-    llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &step,
-    llvm::SmallVectorImpl<mlir::Type> &stepType) {
-
-  llvm::SmallVector<mlir::OpAsmParser::Argument> inductionVars;
-  if (parser.parseLParen() ||
-      parser.parseArgumentList(inductionVars,
-                               mlir::OpAsmParser::Delimiter::None,
-                               /*allowType=*/true) ||
-      parser.parseRParen() || parser.parseEqual() || parser.parseLParen() ||
-      parser.parseOperandList(lowerbound, inductionVars.size(),
-                              mlir::OpAsmParser::Delimiter::None) ||
-      parser.parseColonTypeList(lowerboundType) || parser.parseRParen() ||
-      parser.parseKeyword("to") || parser.parseLParen() ||
-      parser.parseOperandList(upperbound, inductionVars.size(),
-                              mlir::OpAsmParser::Delimiter::None) ||
-      parser.parseColonTypeList(upperboundType) || parser.parseRParen() ||
-      parser.parseKeyword("step") || parser.parseLParen() ||
-      parser.parseOperandList(step, inductionVars.size(),
-                              mlir::OpAsmParser::Delimiter::None) ||
-      parser.parseColonTypeList(stepType) || parser.parseRParen())
-    return mlir::failure();
-  return parser.parseRegion(region, inductionVars);
-}
-
-void printCUFKernelLoopControl(
-    mlir::OpAsmPrinter &p, mlir::Operation *op, mlir::Region &region,
-    mlir::ValueRange lowerbound, mlir::TypeRange lowerboundType,
-    mlir::ValueRange upperbound, mlir::TypeRange upperboundType,
-    mlir::ValueRange steps, mlir::TypeRange stepType) {
-  mlir::ValueRange regionArgs = region.front().getArguments();
-  if (!regionArgs.empty()) {
-    p << "(";
-    llvm::interleaveComma(
-        regionArgs, p, [&p](mlir::Value v) { p << v << " : " << v.getType(); });
-    p << ") = (" << lowerbound << " : " << lowerboundType << ") to ("
-      << upperbound << " : " << upperboundType << ") "
-      << " step (" << steps << " : " << stepType << ") ";
-  }
-  p.printRegion(region, /*printEntryBlockArgs=*/false);
-}
-
-mlir::LogicalResult fir::CUDAKernelOp::verify() {
-  if (getLowerbound().size() != getUpperbound().size() ||
-      getLowerbound().size() != getStep().size())
-    return emitOpError(
-        "expect same number of values in lowerbound, upperbound and step");
-
-  return mlir::success();
-}
-
-mlir::LogicalResult fir::CUDAAllocateOp::verify() {
-  if (getPinned() && getStream())
-    return emitOpError("pinned and stream cannot appears at the same time");
-  if (!mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(getBox().getType())))
-    return emitOpError(
-        "expect box to be a reference to a class or box type value");
-  if (getSource() &&
-      !mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(getSource().getType())))
-    return emitOpError(
-        "expect source to be a reference to/or a class or box type value");
-  if (getErrmsg() &&
-      !mlir::isa<fir::BoxType>(fir::unwrapRefType(getErrmsg().getType())))
-    return emitOpError(
-        "expect errmsg to be a reference to/or a box type value");
-  if (getErrmsg() && !getHasStat())
-    return emitOpError("expect stat attribute when errmsg is provided");
-  return mlir::success();
-}
-
-mlir::LogicalResult fir::CUDADeallocateOp::verify() {
-  if (!mlir::isa<fir::BaseBoxType>(fir::unwrapRefType(getBox().getType())))
-    return emitOpError(
-        "expect box to be a reference to class or box type value");
-  if (getErrmsg() &&
-      !mlir::isa<fir::BoxType>(fir::unwrapRefType(getErrmsg().getType())))
-    return emitOpError(
-        "expect errmsg to be a reference to/or a box type value");
-  if (getErrmsg() && !getHasStat())
-    return emitOpError("expect stat attribute when errmsg is provided");
-  return mlir::success();
-}
-
-void fir::CUDAAllocOp::build(
-    mlir::OpBuilder &builder, mlir::OperationState &result, mlir::Type inType,
-    llvm::StringRef uniqName, llvm::StringRef bindcName,
-    fir::CUDADataAttributeAttr cudaAttr, mlir::ValueRange typeparams,
-    mlir::ValueRange shape, llvm::ArrayRef<mlir::NamedAttribute> attributes) {
-  mlir::StringAttr nameAttr =
-      uniqName.empty() ? mlir::StringAttr{} : builder.getStringAttr(uniqName);
-  mlir::StringAttr bindcAttr =
-      bindcName.empty() ? mlir::StringAttr{} : builder.getStringAttr(bindcName);
-  build(builder, result, wrapAllocaResultType(inType),
-        mlir::TypeAttr::get(inType), nameAttr, bindcAttr, typeparams, shape,
-        cudaAttr);
-  result.addAttributes(attributes);
-}
-
-template <typename Op>
-static mlir::LogicalResult checkCudaAttr(Op op) {
-  if (op.getCudaAttr() == fir::CUDADataAttribute::Device ||
-      op.getCudaAttr() == fir::CUDADataAttribute::Managed ||
-      op.getCudaAttr() == fir::CUDADataAttribute::Unified)
-    return mlir::success();
-  return op.emitOpError("expect device, managed or unified cuda attribute");
-}
-
-mlir::LogicalResult fir::CUDAAllocOp::verify() { return checkCudaAttr(*this); }
-
-mlir::LogicalResult fir::CUDAFreeOp::verify() { return checkCudaAttr(*this); }
-
 //===----------------------------------------------------------------------===//
 // FIROpsDialect
 //===----------------------------------------------------------------------===//
diff --git a/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt
index dc9e080b0f8b..267d6469ee7a 100644
--- a/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt
+++ b/flang/lib/Optimizer/HLFIR/IR/CMakeLists.txt
@@ -5,11 +5,13 @@ add_flang_library(HLFIRDialect
   HLFIROps.cpp
 
   DEPENDS
+  CUFAttrsIncGen
   FIRDialect
   HLFIROpsIncGen
   ${dialect_libs}
 
   LINK_LIBS
+  CUFAttrs
   FIRDialect
   MLIRIR
   ${dialect_libs}
diff --git a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
index c232ae165d4c..11196353b07c 100644
--- a/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
+++ b/flang/lib/Optimizer/HLFIR/IR/HLFIROps.cpp
@@ -127,14 +127,14 @@ void hlfir::DeclareOp::build(mlir::OpBuilder &builder,
                              mlir::ValueRange typeparams,
                              mlir::Value dummy_scope,
                              fir::FortranVariableFlagsAttr fortran_attrs,
-                             fir::CUDADataAttributeAttr cuda_attr) {
+                             cuf::DataAttributeAttr data_attr) {
   auto nameAttr = builder.getStringAttr(uniq_name);
   mlir::Type inputType = memref.getType();
   bool hasExplicitLbs = hasExplicitLowerBounds(shape);
   mlir::Type hlfirVariableType =
       getHLFIRVariableType(inputType, hasExplicitLbs);
   build(builder, result, {hlfirVariableType, inputType}, memref, shape,
-        typeparams, dummy_scope, nameAttr, fortran_attrs, cuda_attr);
+        typeparams, dummy_scope, nameAttr, fortran_attrs, data_attr);
 }
 
 mlir::LogicalResult hlfir::DeclareOp::verify() {
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
index ad569ce3b41f..fa3a59303137 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/HLFIR/Transforms/CMakeLists.txt
@@ -11,11 +11,13 @@ add_flang_library(HLFIRTransforms
   OptimizedBufferization.cpp
 
   DEPENDS
+  CUFAttrsIncGen
   FIRDialect
   HLFIROpsIncGen
   ${dialect_libs}
 
   LINK_LIBS
+  CUFAttrs
   FIRAnalysis
   FIRDialect
   FIRBuilder
diff --git a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
index 3570e0011ca7..e56595d1c8e2 100644
--- a/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
+++ b/flang/lib/Optimizer/HLFIR/Transforms/ConvertToFIR.cpp
@@ -320,16 +320,16 @@ public:
     mlir::Location loc = declareOp->getLoc();
     mlir::Value memref = declareOp.getMemref();
     fir::FortranVariableFlagsAttr fortranAttrs;
-    fir::CUDADataAttributeAttr cudaAttr;
+    cuf::DataAttributeAttr dataAttr;
     if (auto attrs = declareOp.getFortranAttrs())
       fortranAttrs =
           fir::FortranVariableFlagsAttr::get(rewriter.getContext(), *attrs);
-    if (auto attr = declareOp.getCudaAttr())
-      cudaAttr = fir::CUDADataAttributeAttr::get(rewriter.getContext(), *attr);
+    if (auto attr = declareOp.getDataAttr())
+      dataAttr = cuf::DataAttributeAttr::get(rewriter.getContext(), *attr);
     auto firDeclareOp = rewriter.create<fir::DeclareOp>(
         loc, memref.getType(), memref, declareOp.getShape(),
         declareOp.getTypeparams(), declareOp.getDummyScope(),
-        declareOp.getUniqName(), fortranAttrs, cudaAttr);
+        declareOp.getUniqName(), fortranAttrs, dataAttr);
 
     // Propagate other attributes from hlfir.declare to fir.declare.
     // OpenACC's acc.declare is one example. Right now, the propagation
diff --git a/flang/test/Fir/cuf-invalid.fir b/flang/test/Fir/cuf-invalid.fir
index 5a12e3c1a4bf..6e18e48ac82f 100644
--- a/flang/test/Fir/cuf-invalid.fir
+++ b/flang/test/Fir/cuf-invalid.fir
@@ -4,11 +4,11 @@ func.func @_QPsub1() {
   %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
   %1 = fir.alloca i32
   %pinned = fir.alloca i1
-  %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+  %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
   %11 = fir.convert %4#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
   %s = fir.load %1 : !fir.ref<i32>
-  // expected-error@+1{{'fir.cuda_allocate' op pinned and stream cannot appears at the same time}}
-  %13 = fir.cuda_allocate %11 : !fir.ref<!fir.box<none>> stream(%s : i32) pinned(%pinned : !fir.ref<i1>) {cuda_attr = #fir.cuda<device>} -> i32
+  // expected-error@+1{{'cuf.allocate' op pinned and stream cannot appears at the same time}}
+  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> stream(%s : i32) pinned(%pinned : !fir.ref<i1>) {data_attr = #cuf.cuda<device>} -> i32
   return
 }
 
@@ -16,8 +16,8 @@ func.func @_QPsub1() {
 
 func.func @_QPsub1() {
   %1 = fir.alloca i32
-  // expected-error@+1{{'fir.cuda_allocate' op expect box to be a reference to a class or box type value}}
-  %2 = fir.cuda_allocate %1 : !fir.ref<i32> {cuda_attr = #fir.cuda<device>} -> i32
+  // expected-error@+1{{'cuf.allocate' op expect box to be a reference to a class or box type value}}
+  %2 = cuf.allocate %1 : !fir.ref<i32> {data_attr = #cuf.cuda<device>} -> i32
   return
 }
 
@@ -25,15 +25,15 @@ func.func @_QPsub1() {
 
 func.func @_QPsub1() {
   %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
-  %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+  %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
   %c100 = arith.constant 100 : index
   %7 = fir.alloca !fir.char<1,100> {bindc_name = "msg", uniq_name = "_QFsub1Emsg"}
   %8:2 = hlfir.declare %7 typeparams %c100 {uniq_name = "_QFsub1Emsg"} : (!fir.ref<!fir.char<1,100>>, index) -> (!fir.ref<!fir.char<1,100>>, !fir.ref<!fir.char<1,100>>)
   %9 = fir.embox %8#1 : (!fir.ref<!fir.char<1,100>>) -> !fir.box<!fir.char<1,100>>
   %11 = fir.convert %4#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
   %16 = fir.convert %9 : (!fir.box<!fir.char<1,100>>) -> !fir.box<none>
-  // expected-error@+1{{'fir.cuda_allocate' op expect stat attribute when errmsg is provided}}
-  %13 = fir.cuda_allocate %11 : !fir.ref<!fir.box<none>> errmsg(%16 : !fir.box<none>) {cuda_attr = #fir.cuda<device>} -> i32
+  // expected-error@+1{{'cuf.allocate' op expect stat attribute when errmsg is provided}}
+  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> errmsg(%16 : !fir.box<none>) {data_attr = #cuf.cuda<device>} -> i32
   return
 }
 
@@ -41,11 +41,11 @@ func.func @_QPsub1() {
 
 func.func @_QPsub1() {
   %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
-  %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+  %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
   %1 = fir.alloca i32
   %11 = fir.convert %4#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-  // expected-error@+1{{'fir.cuda_allocate' op expect errmsg to be a reference to/or a box type value}}
-  %13 = fir.cuda_allocate %11 : !fir.ref<!fir.box<none>> errmsg(%1 : !fir.ref<i32>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
+  // expected-error@+1{{'cuf.allocate' op expect errmsg to be a reference to/or a box type value}}
+  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> errmsg(%1 : !fir.ref<i32>) {data_attr = #cuf.cuda<device>, hasStat} -> i32
   return
 }
 
@@ -53,8 +53,8 @@ func.func @_QPsub1() {
 
 func.func @_QPsub1() {
   %1 = fir.alloca i32
-  // expected-error@+1{{'fir.cuda_deallocate' op expect box to be a reference to class or box type value}}
-  %2 = fir.cuda_deallocate %1 : !fir.ref<i32> {cuda_attr = #fir.cuda<device>} -> i32
+  // expected-error@+1{{'cuf.deallocate' op expect box to be a reference to class or box type value}}
+  %2 = cuf.deallocate %1 : !fir.ref<i32> {data_attr = #cuf.cuda<device>} -> i32
   return
 }
 
@@ -62,11 +62,11 @@ func.func @_QPsub1() {
 
 func.func @_QPsub1() {
   %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
-  %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+  %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
   %1 = fir.alloca i32
   %11 = fir.convert %4#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-  // expected-error@+1{{'fir.cuda_deallocate' op expect errmsg to be a reference to/or a box type value}}
-  %13 = fir.cuda_deallocate %11 : !fir.ref<!fir.box<none>> errmsg(%1 : !fir.ref<i32>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
+  // expected-error@+1{{'cuf.deallocate' op expect errmsg to be a reference to/or a box type value}}
+  %13 = cuf.deallocate %11 : !fir.ref<!fir.box<none>> errmsg(%1 : !fir.ref<i32>) {data_attr = #cuf.cuda<device>, hasStat} -> i32
   return
 }
 
@@ -74,32 +74,32 @@ func.func @_QPsub1() {
 
 func.func @_QPsub1() {
   %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
-  %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+  %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
   %c100 = arith.constant 100 : index
   %7 = fir.alloca !fir.char<1,100> {bindc_name = "msg", uniq_name = "_QFsub1Emsg"}
   %8:2 = hlfir.declare %7 typeparams %c100 {uniq_name = "_QFsub1Emsg"} : (!fir.ref<!fir.char<1,100>>, index) -> (!fir.ref<!fir.char<1,100>>, !fir.ref<!fir.char<1,100>>)
   %9 = fir.embox %8#1 : (!fir.ref<!fir.char<1,100>>) -> !fir.box<!fir.char<1,100>>
   %11 = fir.convert %4#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
   %16 = fir.convert %9 : (!fir.box<!fir.char<1,100>>) -> !fir.box<none>
-  // expected-error@+1{{'fir.cuda_deallocate' op expect stat attribute when errmsg is provided}}
-  %13 = fir.cuda_deallocate %11 : !fir.ref<!fir.box<none>> errmsg(%16 : !fir.box<none>) {cuda_attr = #fir.cuda<device>} -> i32
+  // expected-error@+1{{'cuf.deallocate' op expect stat attribute when errmsg is provided}}
+  %13 = cuf.deallocate %11 : !fir.ref<!fir.box<none>> errmsg(%16 : !fir.box<none>) {data_attr = #cuf.cuda<device>} -> i32
   return
 }
 
 // -----
 
 func.func @_QPsub1() {
-  // expected-error@+1{{'fir.cuda_alloc' op expect device, managed or unified cuda attribute}}
-  %0 = fir.cuda_alloc f32 {bindc_name = "r", cuda_attr = #fir.cuda<pinned>, uniq_name = "_QFsub1Er"} -> !fir.ref<f32>
-  fir.cuda_free %0 : !fir.ref<f32> {cuda_attr = #fir.cuda<constant>}
+  // expected-error@+1{{'cuf.alloc' op expect device, managed or unified cuda attribute}}
+  %0 = cuf.alloc f32 {bindc_name = "r", data_attr = #cuf.cuda<pinned>, uniq_name = "_QFsub1Er"} -> !fir.ref<f32>
+  cuf.free %0 : !fir.ref<f32> {data_attr = #cuf.cuda<constant>}
   return
 }
 
 // -----
 
 func.func @_QPsub1() {
-  %0 = fir.cuda_alloc f32 {bindc_name = "r", cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub1Er"} -> !fir.ref<f32>
-  // expected-error@+1{{'fir.cuda_free' op expect device, managed or unified cuda attribute}}
-  fir.cuda_free %0 : !fir.ref<f32> {cuda_attr = #fir.cuda<constant>}
+  %0 = cuf.alloc f32 {bindc_name = "r", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Er"} -> !fir.ref<f32>
+  // expected-error@+1{{'cuf.free' op expect device, managed or unified cuda attribute}}
+  cuf.free %0 : !fir.ref<f32> {data_attr = #cuf.cuda<constant>}
   return
 }
diff --git a/flang/test/Fir/cuf.mlir b/flang/test/Fir/cuf.mlir
index 8e2346def43e..188044d04b84 100644
--- a/flang/test/Fir/cuf.mlir
+++ b/flang/test/Fir/cuf.mlir
@@ -4,85 +4,85 @@
 
 func.func @_QPsub1() {
   %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
-  %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+  %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
   %11 = fir.convert %4#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-  %13 = fir.cuda_allocate %11 : !fir.ref<!fir.box<none>> {cuda_attr = #fir.cuda<device>} -> i32
-  %14 = fir.cuda_deallocate %11 : !fir.ref<!fir.box<none>> {cuda_attr = #fir.cuda<device>} -> i32
+  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> {data_attr = #cuf.cuda<device>} -> i32
+  %14 = cuf.deallocate %11 : !fir.ref<!fir.box<none>> {data_attr = #cuf.cuda<device>} -> i32
   return
 }
 
-// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref<!fir.box<none>> {cuda_attr = #fir.cuda<device>} -> i32
-// CHECK: fir.cuda_deallocate %{{.*}} : !fir.ref<!fir.box<none>> {cuda_attr = #fir.cuda<device>} -> i32
+// CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<none>> {data_attr = #cuf.cuda<device>} -> i32
+// CHECK: cuf.deallocate %{{.*}} : !fir.ref<!fir.box<none>> {data_attr = #cuf.cuda<device>} -> i32
 
 // -----
 
 func.func @_QPsub1() {
   %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
   %1 = fir.alloca i32
-  %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+  %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
   %11 = fir.convert %4#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
   %s = fir.load %1 : !fir.ref<i32>
-  %13 = fir.cuda_allocate %11 : !fir.ref<!fir.box<none>> stream(%s : i32) {cuda_attr = #fir.cuda<device>} -> i32
+  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> stream(%s : i32) {data_attr = #cuf.cuda<device>} -> i32
   return
 }
 
-// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref<!fir.box<none>> stream(%{{.*}} : i32) {cuda_attr = #fir.cuda<device>} -> i32
+// CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<none>> stream(%{{.*}} : i32) {data_attr = #cuf.cuda<device>} -> i32
 
 // -----
 
 func.func @_QPsub1() {
   %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
   %1 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "b", uniq_name = "_QFsub1Eb"}
-  %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+  %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
   %5:2 = hlfir.declare %1 {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
   %11 = fir.convert %4#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
   %12 = fir.convert %5#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-  %13 = fir.cuda_allocate %11 : !fir.ref<!fir.box<none>> source(%12 : !fir.ref<!fir.box<none>>) {cuda_attr = #fir.cuda<device>} -> i32
+  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> source(%12 : !fir.ref<!fir.box<none>>) {data_attr = #cuf.cuda<device>} -> i32
   return
 }
 
-// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref<!fir.box<none>> source(%{{.*}} : !fir.ref<!fir.box<none>>) {cuda_attr = #fir.cuda<device>} -> i32
+// CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<none>> source(%{{.*}} : !fir.ref<!fir.box<none>>) {data_attr = #cuf.cuda<device>} -> i32
 
 // -----
 
 func.func @_QPsub1() {
   %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
   %pinned = fir.alloca i1
-  %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+  %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
   %11 = fir.convert %4#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
-  %13 = fir.cuda_allocate %11 : !fir.ref<!fir.box<none>> pinned(%pinned : !fir.ref<i1>) {cuda_attr = #fir.cuda<device>} -> i32
+  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> pinned(%pinned : !fir.ref<i1>) {data_attr = #cuf.cuda<device>} -> i32
   return
 }
 
-// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref<!fir.box<none>> pinned(%{{.*}} : !fir.ref<i1>) {cuda_attr = #fir.cuda<device>} -> i32
+// CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<none>> pinned(%{{.*}} : !fir.ref<i1>) {data_attr = #cuf.cuda<device>} -> i32
 
 // -----
 
 func.func @_QPsub1() {
   %0 = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
-  %4:2 = hlfir.declare %0 {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+  %4:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
   %c100 = arith.constant 100 : index
   %7 = fir.alloca !fir.char<1,100> {bindc_name = "msg", uniq_name = "_QFsub1Emsg"}
   %8:2 = hlfir.declare %7 typeparams %c100 {uniq_name = "_QFsub1Emsg"} : (!fir.ref<!fir.char<1,100>>, index) -> (!fir.ref<!fir.char<1,100>>, !fir.ref<!fir.char<1,100>>)
   %9 = fir.embox %8#1 : (!fir.ref<!fir.char<1,100>>) -> !fir.box<!fir.char<1,100>>
   %11 = fir.convert %4#1 : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> !fir.ref<!fir.box<none>>
   %16 = fir.convert %9 : (!fir.box<!fir.char<1,100>>) -> !fir.box<none>
-  %13 = fir.cuda_allocate %11 : !fir.ref<!fir.box<none>> errmsg(%16 : !fir.box<none>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
-  %14 = fir.cuda_deallocate %11 : !fir.ref<!fir.box<none>> errmsg(%16 : !fir.box<none>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
+  %13 = cuf.allocate %11 : !fir.ref<!fir.box<none>> errmsg(%16 : !fir.box<none>) {data_attr = #cuf.cuda<device>, hasStat} -> i32
+  %14 = cuf.deallocate %11 : !fir.ref<!fir.box<none>> errmsg(%16 : !fir.box<none>) {data_attr = #cuf.cuda<device>, hasStat} -> i32
   return
 }
 
-// CHECK: fir.cuda_allocate %{{.*}} : !fir.ref<!fir.box<none>> errmsg(%{{.*}} : !fir.box<none>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
-// CHECK: fir.cuda_deallocate %{{.*}} : !fir.ref<!fir.box<none>> errmsg(%{{.*}} : !fir.box<none>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
+// CHECK: cuf.allocate %{{.*}} : !fir.ref<!fir.box<none>> errmsg(%{{.*}} : !fir.box<none>) {data_attr = #cuf.cuda<device>, hasStat} -> i32
+// CHECK: cuf.deallocate %{{.*}} : !fir.ref<!fir.box<none>> errmsg(%{{.*}} : !fir.box<none>) {data_attr = #cuf.cuda<device>, hasStat} -> i32
 
 // -----
 
 func.func @_QPsub1() {
-  %0 = fir.cuda_alloc f32 {bindc_name = "r", cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub1Er"} -> !fir.ref<f32>
-  fir.cuda_free %0 : !fir.ref<f32> {cuda_attr = #fir.cuda<device>}
+  %0 = cuf.alloc f32 {bindc_name = "r", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Er"} -> !fir.ref<f32>
+  cuf.free %0 : !fir.ref<f32> {data_attr = #cuf.cuda<device>}
   return
 }
 
-// CHECK: fir.cuda_alloc
-// CHECK: fir.cuda_free
+// CHECK: cuf.alloc
+// CHECK: cuf.free
 
diff --git a/flang/test/Lower/CUDA/cuda-allocatable.cuf b/flang/test/Lower/CUDA/cuda-allocatable.cuf
index eff5f13669e9..74a3ec100a8f 100644
--- a/flang/test/Lower/CUDA/cuda-allocatable.cuf
+++ b/flang/test/Lower/CUDA/cuda-allocatable.cuf
@@ -11,11 +11,11 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPsub1()
 ! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub1Ea"}
-! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub1Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 ! CHECK: fir.call @_FortranAAllocatableSetBounds
-! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: %{{.*}} = cuf.allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
 
-! CHECK: %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
 
 ! CHECK: %[[BOX_LOAD:.*]] = fir.load %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 ! CHECK: %[[ADDR:.*]] = fir.box_addr %[[BOX_LOAD]] : (!fir.box<!fir.heap<!fir.array<?xf32>>>) -> !fir.heap<!fir.array<?xf32>>
@@ -23,7 +23,7 @@ end subroutine
 ! CHECK: %[[C0:.*]] = arith.constant 0 : i64
 ! CHECK: %[[NE_C0:.*]] = arith.cmpi ne, %[[ADDR_I64]], %[[C0]] : i64
 ! CHECK: fir.if %[[NE_C0]] {
-! CHECK:   %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK:   %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
 ! CHECK: }
 
 subroutine sub2()
@@ -36,18 +36,18 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPsub2()
 ! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub2Ea"}
-! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub2Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub2Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 ! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub2Eistat"}
 ! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub2Eistat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: fir.call @_FortranAAllocatableSetBounds
-! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<managed>, hasStat} -> i32
+! CHECK: %[[STAT:.*]] = cuf.allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<managed>, hasStat} -> i32
 ! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>
 
-! CHECK: %[[STAT:.*]] = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<managed>, hasStat} -> i32
+! CHECK: %[[STAT:.*]] = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<managed>, hasStat} -> i32
 ! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>
 
 ! CHECK: fir.if %{{.*}} {
-! CHECK:   %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<managed>} -> i32
+! CHECK:   %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<managed>} -> i32
 ! CHECK: }
 
 subroutine sub3()
@@ -58,13 +58,13 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPsub3()
 ! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?x?xi32>>> {bindc_name = "a", uniq_name = "_QFsub3Ea"}
-! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub3Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub3Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>>)
 ! CHECK: %[[PLOG:.*]] = fir.alloca !fir.logical<4> {bindc_name = "plog", uniq_name = "_QFsub3Eplog"}
 ! CHECK: %[[PLOG_DECL:.*]]:2 = hlfir.declare %5 {uniq_name = "_QFsub3Eplog"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
 ! CHECK-2: fir.call @_FortranAAllocatableSetBounds
-! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> pinned(%[[PLOG_DECL]]#1 : !fir.ref<!fir.logical<4>>) {cuda_attr = #fir.cuda<pinned>} -> i32
+! CHECK: %{{.*}} = cuf.allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> pinned(%[[PLOG_DECL]]#1 : !fir.ref<!fir.logical<4>>) {data_attr = #cuf.cuda<pinned>} -> i32
 ! CHECK: fir.if %{{.*}} {
-! CHECK:   %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> {cuda_attr = #fir.cuda<pinned>} -> i32
+! CHECK:   %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?x?xi32>>>> {data_attr = #cuf.cuda<pinned>} -> i32
 ! CHECK: }
 
 subroutine sub4()
@@ -75,14 +75,14 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPsub4()
 ! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub4Ea"}
-! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %0 {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub4Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %0 {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub4Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 ! CHECK: %[[ISTREAM:.*]] = fir.alloca i32 {bindc_name = "istream", uniq_name = "_QFsub4Eistream"}
 ! CHECK: %[[ISTREAM_DECL:.*]]:2 = hlfir.declare %[[ISTREAM]] {uniq_name = "_QFsub4Eistream"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: fir.call @_FortranAAllocatableSetBounds
 ! CHECK: %[[STREAM:.*]] = fir.load %[[ISTREAM_DECL]]#0 : !fir.ref<i32>
-! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> stream(%[[STREAM]] : i32) {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: %{{.*}} = cuf.allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> stream(%[[STREAM]] : i32) {data_attr = #cuf.cuda<device>} -> i32
 ! CHECK: fir.if %{{.*}} {
-! CHECK:   %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK:   %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
 ! CHECK: }
 
 subroutine sub5()
@@ -93,16 +93,16 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPsub5()
 ! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub5Ea"}
-! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 ! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "b", uniq_name = "_QFsub5Eb"}
 ! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub5Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 ! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 ! CHECK: fir.call @_FortranAAllocatableSetBounds
-! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> source(%[[LOAD_B]] : !fir.box<!fir.heap<!fir.array<?xf32>>>) {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: %{{.*}} = cuf.allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> source(%[[LOAD_B]] : !fir.box<!fir.heap<!fir.array<?xf32>>>) {data_attr = #cuf.cuda<device>} -> i32
 ! CHECK: fir.if
 ! CHECK: fir.freemem
 ! CHECK: fir.if %{{.*}} {
-! CHECK:   %{{.*}} = fir.cuda_deallocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK:   %{{.*}} = cuf.deallocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
 ! CHECK: }
 
 subroutine sub6()
@@ -113,14 +113,14 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPsub6()
 ! CHECK: %[[BOX_A:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub6Ea"}
-! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub6Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[BOX_A_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub6Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 ! CHECK: %[[BOX_B:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "b", uniq_name = "_QFsub6Eb"}
 ! CHECK: %[[BOX_B_DECL:.*]]:2 = hlfir.declare %[[BOX_B]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub6Eb"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 ! CHECK: %[[LOAD_B:.*]] = fir.load %[[BOX_B_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>
 ! CHECK: fir.call @_FortranAAllocatableApplyMold
-! CHECK: %{{.*}} = fir.cuda_allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK: %{{.*}} = cuf.allocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
 ! CHECK: fir.if %{{.*}} {
-! CHECK:   %{{.*}} = fir.cuda_deallocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK:   %{{.*}} = cuf.deallocate %[[BOX_A_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
 ! CHECK: }
 
 subroutine sub7()
@@ -134,19 +134,19 @@ end subroutine
 
 ! CHECK-LABEL: func.func @_QPsub7()
 ! CHECK: %[[BOX:.*]] = fir.alloca !fir.box<!fir.heap<!fir.array<?xf32>>> {bindc_name = "a", uniq_name = "_QFsub7Ea"}
-! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {cuda_attr = #fir.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
+! CHECK: %[[BOX_DECL:.*]]:2 = hlfir.declare %[[BOX]] {data_attr = #cuf.cuda<device>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFsub7Ea"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>>)
 ! CHECK: %[[ERR:.*]] = fir.alloca !fir.char<1,50> {bindc_name = "err", uniq_name = "_QFsub7Eerr"}
 ! CHECK: %[[ERR_DECL:.*]]:2 = hlfir.declare %[[ERR]] typeparams %{{.*}} {uniq_name = "_QFsub7Eerr"} : (!fir.ref<!fir.char<1,50>>, index) -> (!fir.ref<!fir.char<1,50>>, !fir.ref<!fir.char<1,50>>)
 ! CHECK: %[[ISTAT:.*]] = fir.alloca i32 {bindc_name = "istat", uniq_name = "_QFsub7Eistat"}
 ! CHECK: %[[ISTAT_DECL:.*]]:2 = hlfir.declare %[[ISTAT]] {uniq_name = "_QFsub7Eistat"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 ! CHECK: %[[ERR_BOX:.*]] = fir.embox %[[ERR_DECL]]#1 : (!fir.ref<!fir.char<1,50>>) -> !fir.box<!fir.char<1,50>>
 ! CHECK: fir.call @_FortranAAllocatableSetBounds
-! CHECK: %[[STAT:.*]] = fir.cuda_allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> errmsg(%[[ERR_BOX]] : !fir.box<!fir.char<1,50>>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
+! CHECK: %[[STAT:.*]] = cuf.allocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> errmsg(%[[ERR_BOX]] : !fir.box<!fir.char<1,50>>) {data_attr = #cuf.cuda<device>, hasStat} -> i32
 ! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>
 
 ! CHECK: %[[ERR_BOX:.*]] = fir.embox %[[ERR_DECL]]#1 : (!fir.ref<!fir.char<1,50>>) -> !fir.box<!fir.char<1,50>>
-! CHECK: %[[STAT:.*]] = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> errmsg(%15 : !fir.box<!fir.char<1,50>>) {cuda_attr = #fir.cuda<device>, hasStat} -> i32
+! CHECK: %[[STAT:.*]] = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> errmsg(%15 : !fir.box<!fir.char<1,50>>) {data_attr = #cuf.cuda<device>, hasStat} -> i32
 ! CHECK: fir.store %[[STAT]] to %[[ISTAT_DECL]]#1 : !fir.ref<i32>
 ! CHECK: fir.if %{{.*}} {
-! CHECK:   %{{.*}} = fir.cuda_deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {cuda_attr = #fir.cuda<device>} -> i32
+! CHECK:   %{{.*}} = cuf.deallocate %[[BOX_DECL]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf32>>>> {data_attr = #cuf.cuda<device>} -> i32
 ! CHECK: }
diff --git a/flang/test/Lower/CUDA/cuda-data-attribute.cuf b/flang/test/Lower/CUDA/cuda-data-attribute.cuf
index 3eb42a6a5d40..f7f58a43a143 100644
--- a/flang/test/Lower/CUDA/cuda-data-attribute.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-attribute.cuf
@@ -5,13 +5,13 @@
 
 module cuda_var
   real, constant :: mod_a_rc
-! CHECK: fir.global @_QMcuda_varEmod_a_rc {cuda_attr = #fir.cuda<constant>} : f32 
+! CHECK: fir.global @_QMcuda_varEmod_a_rc {data_attr = #cuf.cuda<constant>} : f32 
   real, device :: mod_b_ra
-! CHECK: fir.global @_QMcuda_varEmod_b_ra {cuda_attr = #fir.cuda<device>} : f32
+! CHECK: fir.global @_QMcuda_varEmod_b_ra {data_attr = #cuf.cuda<device>} : f32
   real, allocatable, managed :: mod_c_rm
-! CHECK: fir.global @_QMcuda_varEmod_c_rm {cuda_attr = #fir.cuda<managed>} : !fir.box<!fir.heap<f32>>
+! CHECK: fir.global @_QMcuda_varEmod_c_rm {data_attr = #cuf.cuda<managed>} : !fir.box<!fir.heap<f32>>
   real, allocatable, pinned :: mod_d_rp
-! CHECK: fir.global @_QMcuda_varEmod_d_rp {cuda_attr = #fir.cuda<pinned>} : !fir.box<!fir.heap<f32>>
+! CHECK: fir.global @_QMcuda_varEmod_d_rp {data_attr = #cuf.cuda<pinned>} : !fir.box<!fir.heap<f32>>
 
 contains
 
@@ -23,44 +23,44 @@ subroutine local_var_attrs
 end subroutine
 
 ! CHECK-LABEL: func.func @_QMcuda_varPlocal_var_attrs()
-! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QMcuda_varFlocal_var_attrsErd"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
-! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFlocal_var_attrsErm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
-! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFlocal_var_attrsErp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
-! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<unified>, uniq_name = "_QMcuda_varFlocal_var_attrsEru"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QMcuda_varFlocal_var_attrsErd"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFlocal_var_attrsErm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFlocal_var_attrsErp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+! CHECK: %{{.*}}:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<unified>, uniq_name = "_QMcuda_varFlocal_var_attrsEru"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
 
 
-! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QMcuda_varFlocal_var_attrsErd"} : (!fir.ref<f32>) -> !fir.ref<f32>
-! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFlocal_var_attrsErm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<f32>>>
-! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFlocal_var_attrsErp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<f32>>>
-! FIR: %{{.*}} = fir.declare %{{.*}} {cuda_attr = #fir.cuda<unified>, uniq_name = "_QMcuda_varFlocal_var_attrsEru"} : (!fir.ref<f32>) -> !fir.ref<f32>
+! FIR: %{{.*}} = fir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QMcuda_varFlocal_var_attrsErd"} : (!fir.ref<f32>) -> !fir.ref<f32>
+! FIR: %{{.*}} = fir.declare %{{.*}} {data_attr = #cuf.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFlocal_var_attrsErm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<f32>>>
+! FIR: %{{.*}} = fir.declare %{{.*}} {data_attr = #cuf.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFlocal_var_attrsErp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>) -> !fir.ref<!fir.box<!fir.heap<f32>>>
+! FIR: %{{.*}} = fir.declare %{{.*}} {data_attr = #cuf.cuda<unified>, uniq_name = "_QMcuda_varFlocal_var_attrsEru"} : (!fir.ref<f32>) -> !fir.ref<f32>
 
 subroutine dummy_arg_device(dd)
   real, device :: dd
 end subroutine
 ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_device(
-! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "dd", fir.cuda_attr = #fir.cuda<device>}) {
-! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QMcuda_varFdummy_arg_deviceEdd"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "dd"}) {
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {data_attr = #cuf.cuda<device>, uniq_name = "_QMcuda_varFdummy_arg_deviceEdd"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 
 subroutine dummy_arg_managed(dm)
   real, allocatable, managed :: dm
 end subroutine
 ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_managed(
-! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "dm", fir.cuda_attr = #fir.cuda<managed>}) {
-! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {cuda_attr = #fir.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_managedEdm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {cuf.data_attr = #cuf.cuda<managed>, fir.bindc_name = "dm"}) {
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {data_attr = #cuf.cuda<managed>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_managedEdm"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
 
 subroutine dummy_arg_pinned(dp)
   real, allocatable, pinned :: dp
 end subroutine
 ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_pinned(
-! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "dp", fir.cuda_attr = #fir.cuda<pinned>}) {
-! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {cuda_attr = #fir.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_pinnedEdp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<!fir.box<!fir.heap<f32>>> {cuf.data_attr = #cuf.cuda<pinned>, fir.bindc_name = "dp"}) {
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {data_attr = #cuf.cuda<pinned>, fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QMcuda_varFdummy_arg_pinnedEdp"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.ref<!fir.box<!fir.heap<f32>>>)
 
 subroutine dummy_arg_unified(du)
   real, unified :: du
 end subroutine
 ! CHECK-LABEL: func.func @_QMcuda_varPdummy_arg_unified(
-! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {fir.bindc_name = "du", fir.cuda_attr = #fir.cuda<unified>})
-! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {cuda_attr = #fir.cuda<unified>, uniq_name = "_QMcuda_varFdummy_arg_unifiedEdu"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK-SAME: %[[ARG0:.*]]: !fir.ref<f32> {cuf.data_attr = #cuf.cuda<unified>, fir.bindc_name = "du"})
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ARG0]] dummy_scope %{{[0-9]+}} {data_attr = #cuf.cuda<unified>, uniq_name = "_QMcuda_varFdummy_arg_unifiedEdu"} : (!fir.ref<f32>, !fir.dscope) -> (!fir.ref<f32>, !fir.ref<f32>)
 
 subroutine cuda_alloc_free(n)
   integer :: n
@@ -70,27 +70,27 @@ subroutine cuda_alloc_free(n)
 end
 
 ! CHECK-LABEL: func.func @_QMcuda_varPcuda_alloc_free
-! CHECK: %[[ALLOC_A:.*]] = fir.cuda_alloc !fir.array<10xf32> {bindc_name = "a", cuda_attr = #fir.cuda<device>, uniq_name = "_QMcuda_varFcuda_alloc_freeEa"} -> !fir.ref<!fir.array<10xf32>>
+! CHECK: %[[ALLOC_A:.*]] = cuf.alloc !fir.array<10xf32> {bindc_name = "a", data_attr = #cuf.cuda<device>, uniq_name = "_QMcuda_varFcuda_alloc_freeEa"} -> !fir.ref<!fir.array<10xf32>>
 ! CHECK: %[[SHAPE:.*]] = fir.shape %c10 : (index) -> !fir.shape<1>
-! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ALLOC_A]](%[[SHAPE]]) {cuda_attr = #fir.cuda<device>, uniq_name = "_QMcuda_varFcuda_alloc_freeEa"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
+! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ALLOC_A]](%[[SHAPE]]) {data_attr = #cuf.cuda<device>, uniq_name = "_QMcuda_varFcuda_alloc_freeEa"} : (!fir.ref<!fir.array<10xf32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xf32>>, !fir.ref<!fir.array<10xf32>>)
 
-! CHECK: %[[ALLOC_U:.*]] = fir.cuda_alloc i32 {bindc_name = "u", cuda_attr = #fir.cuda<unified>, uniq_name = "_QMcuda_varFcuda_alloc_freeEu"} -> !fir.ref<i32>
-! CHECK: %[[DECL_U:.*]]:2 = hlfir.declare %[[ALLOC_U]] {cuda_attr = #fir.cuda<unified>, uniq_name = "_QMcuda_varFcuda_alloc_freeEu"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[ALLOC_U:.*]] = cuf.alloc i32 {bindc_name = "u", data_attr = #cuf.cuda<unified>, uniq_name = "_QMcuda_varFcuda_alloc_freeEu"} -> !fir.ref<i32>
+! CHECK: %[[DECL_U:.*]]:2 = hlfir.declare %[[ALLOC_U]] {data_attr = #cuf.cuda<unified>, uniq_name = "_QMcuda_varFcuda_alloc_freeEu"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 
-! CHECK: %[[ALLOC_B:.*]] = fir.cuda_alloc !fir.array<?xf32>, %{{.*}} : index {bindc_name = "b", cuda_attr = #fir.cuda<managed>, uniq_name = "_QMcuda_varFcuda_alloc_freeEb"} -> !fir.ref<!fir.array<?xf32>>
+! CHECK: %[[ALLOC_B:.*]] = cuf.alloc !fir.array<?xf32>, %{{.*}} : index {bindc_name = "b", data_attr = #cuf.cuda<managed>, uniq_name = "_QMcuda_varFcuda_alloc_freeEb"} -> !fir.ref<!fir.array<?xf32>>
 ! CHECK: %[[SHAPE:.*]] = fir.shape %{{.*}} : (index) -> !fir.shape<1>
-! CHECK: %[[DECL_B:.*]]:2 = hlfir.declare %[[ALLOC_B]](%[[SHAPE]]) {cuda_attr = #fir.cuda<managed>, uniq_name = "_QMcuda_varFcuda_alloc_freeEb"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
+! CHECK: %[[DECL_B:.*]]:2 = hlfir.declare %[[ALLOC_B]](%[[SHAPE]]) {data_attr = #cuf.cuda<managed>, uniq_name = "_QMcuda_varFcuda_alloc_freeEb"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xf32>>, !fir.ref<!fir.array<?xf32>>)
 
-! CHECK: fir.cuda_free %[[DECL_B]]#1 : !fir.ref<!fir.array<?xf32>> {cuda_attr = #fir.cuda<managed>}
-! CHECK: fir.cuda_free %[[DECL_U]]#1 : !fir.ref<i32> {cuda_attr = #fir.cuda<unified>}
-! CHECK: fir.cuda_free %[[DECL_A]]#1 : !fir.ref<!fir.array<10xf32>> {cuda_attr = #fir.cuda<device>}
+! CHECK: cuf.free %[[DECL_B]]#1 : !fir.ref<!fir.array<?xf32>> {data_attr = #cuf.cuda<managed>}
+! CHECK: cuf.free %[[DECL_U]]#1 : !fir.ref<i32> {data_attr = #cuf.cuda<unified>}
+! CHECK: cuf.free %[[DECL_A]]#1 : !fir.ref<!fir.array<10xf32>> {data_attr = #cuf.cuda<device>}
 
 subroutine dummy(x)
   real, target, device :: x
 end subroutine
 
 ! CHECK: func.func @_QMcuda_varPdummy
-! CHECK-NOT: fir.cuda_free
+! CHECK-NOT: cuf.free
 
 end module
 
diff --git a/flang/test/Lower/CUDA/cuda-data-transfer.cuf b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
index 0a2608639bce..084314ed63ec 100644
--- a/flang/test/Lower/CUDA/cuda-data-transfer.cuf
+++ b/flang/test/Lower/CUDA/cuda-data-transfer.cuf
@@ -29,37 +29,37 @@ end
 
 ! CHECK-LABEL: func.func @_QPsub1()
 
-! CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub1Eadev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Eadev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub1Eahost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[I:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub1Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[M:.*]]:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub1Em"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: %[[M:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub1Em"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
 
 ! CHECK: %[[C1:.*]] = arith.constant 1 : i32
 ! CHECK: %[[LOADED_I:.*]] = fir.load %[[I]]#0 : !fir.ref<i32>
 ! CHECK: %[[ADD:.*]] = arith.addi %[[C1]], %[[LOADED_I]] : i32
 ! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[ADD]] {uniq_name = ".cuf_host_tmp"} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
-! CHECK: fir.cuda_data_transfer %[[ASSOC]]#0 to %[[M]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<i32>, !fir.ref<i32>
+! CHECK: cuf.data_transfer %[[ASSOC]]#0 to %[[M]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<i32>, !fir.ref<i32>
 ! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<i32>, i1
 
 ! CHECK: %[[C1:.*]] = arith.constant 1 : i32
 ! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[C1]] {uniq_name = ".cuf_host_tmp"} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1)
-! CHECK: fir.cuda_data_transfer %[[ASSOC]]#0 to %[[M]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<i32>, !fir.ref<i32>
+! CHECK: cuf.data_transfer %[[ASSOC]]#0 to %[[M]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<i32>, !fir.ref<i32>
 ! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<i32>, i1
 
-! CHECK: fir.cuda_data_transfer %[[AHOST]]#0 to %[[ADEV]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
+! CHECK: cuf.data_transfer %[[AHOST]]#0 to %[[ADEV]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
 
 ! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32> {
 ! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[ELEMENTAL]](%{{.*}}) {uniq_name = ".cuf_host_tmp"} : (!hlfir.expr<10xi32>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>, i1)
-! CHECK: fir.cuda_data_transfer %[[ASSOC]]#0 to %[[ADEV]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
+! CHECK: cuf.data_transfer %[[ASSOC]]#0 to %[[ADEV]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
 ! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<!fir.array<10xi32>>, i1
 
 ! CHECK: %[[DES_AHOST:.*]] = hlfir.designate %[[AHOST]]#0 (%c1{{.*}}:%c5{{.*}}:%c1{{.*}})  shape %{{.*}} : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<5xi32>>
 ! CHECK: %[[DES_ADEV:.*]] = hlfir.designate %[[ADEV]]#0 (%c1{{.*}}:%c5{{.*}}:%c1{{.*}})  shape %{{.*}} : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<5xi32>>
-! CHECK: fir.cuda_data_transfer %[[DES_AHOST]] to %[[DES_ADEV]] {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<5xi32>>, !fir.ref<!fir.array<5xi32>>
+! CHECK: cuf.data_transfer %[[DES_AHOST]] to %[[DES_ADEV]] {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.array<5xi32>>, !fir.ref<!fir.array<5xi32>>
 
 ! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32>
 ! CHECK: %[[ASSOC:.*]]:3 = hlfir.associate %[[ELEMENTAL]](%{{.*}}) {uniq_name = ".cuf_host_tmp"} : (!hlfir.expr<10xi32>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>, i1)
-! CHECK: fir.cuda_data_transfer %[[ASSOC]]#0 to %[[ADEV]]#0 {transfer_kind = #fir.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
+! CHECK: cuf.data_transfer %[[ASSOC]]#0 to %[[ADEV]]#0 {transfer_kind = #cuf.cuda_transfer<host_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
 ! CHECK: hlfir.end_associate %[[ASSOC]]#1, %[[ASSOC]]#2 : !fir.ref<!fir.array<10xi32>>, i1
 
 subroutine sub2()
@@ -81,25 +81,25 @@ subroutine sub2()
 end
 
 ! CHECK-LABEL: func.func @_QPsub2()
-! CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub2Eadev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[ADEV:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub2Eadev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub2Eahost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
-! CHECK: %[[BDEV:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub2Ebdev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+! CHECK: %[[BDEV:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub2Ebdev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[BHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub2Ebhost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[I:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFsub2Ei"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: %[[M:.*]]:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub2Em"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
-! CHECK: fir.cuda_data_transfer %[[ADEV]]#0 to %[[AHOST]]#0 {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
-! CHECK: fir.cuda_data_transfer %[[M]]#0 to %[[I]]#0 {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<i32>, !fir.ref<i32>
+! CHECK: %[[M:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub2Em"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK: cuf.data_transfer %[[ADEV]]#0 to %[[AHOST]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
+! CHECK: cuf.data_transfer %[[M]]#0 to %[[I]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<i32>, !fir.ref<i32>
 
 ! CHECK: %[[DES_ADEV:.*]] = hlfir.designate %[[ADEV]]#0 (%{{.*}}:%{{.*}}:%{{.*}})  shape %{{.*}} : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<5xi32>>
 ! CHECK: %[[DES_AHOST:.*]] = hlfir.designate %[[AHOST]]#0 (%{{.*}}:%{{.*}}:%{{.*}})  shape %{{.*}} : (!fir.ref<!fir.array<10xi32>>, index, index, index, !fir.shape<1>) -> !fir.ref<!fir.array<5xi32>>
-! CHECK: fir.cuda_data_transfer %[[DES_ADEV]] to %[[DES_AHOST]] {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<!fir.array<5xi32>>, !fir.ref<!fir.array<5xi32>>
+! CHECK: cuf.data_transfer %[[DES_ADEV]] to %[[DES_AHOST]] {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.array<5xi32>>, !fir.ref<!fir.array<5xi32>>
 
-! CHECK: fir.cuda_data_transfer %[[ADEV]]#0 to %[[BDEV]]#0 {transfer_kind = #fir.cuda_transfer<device_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
+! CHECK: cuf.data_transfer %[[ADEV]]#0 to %[[BDEV]]#0 {transfer_kind = #cuf.cuda_transfer<device_device>} : !fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>
 
 ! CHECK: %[[TEMP:.*]] = fir.allocmem !fir.array<10xi32> {bindc_name = ".tmp", uniq_name = ""}
 ! CHECK: %[[DECL_TEMP:.*]]:2 = hlfir.declare %[[TEMP]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>)
-! CHECK: %[[ADEV_TEMP:.*]]:2 = hlfir.declare %[[DECL_TEMP]]#1(%{{.*}}) {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub2Eadev"} : (!fir.heap<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>)
-! CHECK: fir.cuda_data_transfer %[[ADEV]]#1 to %[[DECL_TEMP]]#0 {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>
+! CHECK: %[[ADEV_TEMP:.*]]:2 = hlfir.declare %[[DECL_TEMP]]#1(%{{.*}}) {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub2Eadev"} : (!fir.heap<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.heap<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>)
+! CHECK: cuf.data_transfer %[[ADEV]]#1 to %[[DECL_TEMP]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.array<10xi32>>, !fir.heap<!fir.array<10xi32>>
 ! CHECK: %[[ELEMENTAL:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr<10xi32>
 ! CHECK: hlfir.assign %[[ELEMENTAL]] to %[[BHOST]]#0 : !hlfir.expr<10xi32>, !fir.ref<!fir.array<10xi32>>
 ! CHECK: fir.freemem %[[DECL_TEMP]]#0 : !fir.heap<!fir.array<10xi32>>
@@ -116,12 +116,12 @@ end
 ! CHECK: %[[TMP:.*]] = fir.alloca !fir.type<_QMmod1Tt1{i:i32}> {bindc_name = ".tmp"}
 ! CHECK: %[[AHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub3Eahost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
 ! CHECK: %[[BHOST:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub3Ebhost"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
-! CHECK: %[[T:.*]]:2 = hlfir.declare %7 {cuda_attr = #fir.cuda<device>, uniq_name = "_QFsub3Et"} : (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>) -> (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>, !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>)
+! CHECK: %[[T:.*]]:2 = hlfir.declare %7 {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub3Et"} : (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>) -> (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>, !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>)
 ! CHECK: %[[TMP_DECL:.*]]:2 = hlfir.declare %0 {uniq_name = ".tmp"} : (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>) -> (!fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>, !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>)
-! CHECK: fir.cuda_data_transfer %[[T]]#1 to %[[TMP_DECL]]#0 {transfer_kind = #fir.cuda_transfer<device_host>} : !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>, !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>
+! CHECK: cuf.data_transfer %[[T]]#1 to %[[TMP_DECL]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>, !fir.ref<!fir.type<_QMmod1Tt1{i:i32}>>
 
 
-! Check that fir.cuda_data_transfer are not generated within cuf kernel
+! Check that cuf.data_transfer are not generated within cuf kernel
 subroutine sub4()
   integer, parameter :: n = 10
   real, device :: adev(n)
@@ -137,9 +137,9 @@ subroutine sub4()
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPsub4()
-! CHECK: fir.cuda_data_transfer
-! CHECK: fir.cuda_kernel<<<*, *>>>
-! CHECK-NOT: fir.cuda_data_transfer
+! CHECK: cuf.data_transfer
+! CHECK: cuf.kernel<<<*, *>>>
+! CHECK-NOT: cuf.data_transfer
 ! CHECK: hlfir.assign
 
 attributes(global) subroutine sub5(a)
@@ -149,7 +149,7 @@ attributes(global) subroutine sub5(a)
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPsub5
-! CHECK-NOT: fir.cuda_data_transfer
+! CHECK-NOT: cuf.data_transfer
 
 attributes(host,device) subroutine sub6(a)
   integer, device :: a
@@ -158,4 +158,4 @@ attributes(host,device) subroutine sub6(a)
 end subroutine
 
 ! CHECK-LABEL: func.func @_QPsub6
-! CHECK: fir.cuda_data_transfer
+! CHECK: cuf.data_transfer
diff --git a/flang/test/Lower/CUDA/cuda-kernel-calls.cuf b/flang/test/Lower/CUDA/cuda-kernel-calls.cuf
index 7e28fbb2231a..82d1a61f8e15 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-calls.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-calls.cuf
@@ -16,10 +16,10 @@ contains
   subroutine host()
     real, device :: a
 ! CHECK-LABEL: func.func @_QMtest_callPhost()
-! CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}} {cuda_attr = #fir.cuda<device>, uniq_name = "_QMtest_callFhostEa"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %[[A:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QMtest_callFhostEa"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
 
     call dev_kernel0<<<10, 20>>>()
-! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}>>>()
+! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}>>>()
 
     call dev_kernel0<<< __builtin_dim3(1,1,4), __builtin_dim3(32,1,1) >>>
 ! CHECK: %[[ADDR_DIM3_GRID:.*]] = fir.address_of(@_QQro._QM__fortran_builtinsT__builtin_dim3.{{.*}}) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
@@ -38,16 +38,16 @@ contains
 ! CHECK: %[[BLOCK_Y_LOAD:.*]] = fir.load %[[BLOCK_Y]] : !fir.ref<i32>
 ! CHECK: %[[BLOCK_Z:.*]] = hlfir.designate %[[DIM3_BLOCK]]#1{"z"}   : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
 ! CHECK: %[[BLOCK_Z_LOAD:.*]] = fir.load %[[BLOCK_Z]] : !fir.ref<i32>
-! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%[[GRID_X_LOAD]], %[[GRID_Y_LOAD]], %[[GRID_Z_LOAD]], %[[BLOCK_X_LOAD]], %[[BLOCK_Y_LOAD]], %[[BLOCK_Z_LOAD]]>>>()
+! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel0<<<%[[GRID_X_LOAD]], %[[GRID_Y_LOAD]], %[[GRID_Z_LOAD]], %[[BLOCK_X_LOAD]], %[[BLOCK_Y_LOAD]], %[[BLOCK_Z_LOAD]]>>>()
 
     call dev_kernel0<<<10, 20, 2>>>()
-! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}>>>() 
+! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}>>>() 
 
     call dev_kernel0<<<10, 20, 2, 0>>>()
-! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}, %c0{{.*}}>>>()
+! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}, %c0{{.*}}>>>()
 
     call dev_kernel1<<<1, 32>>>(a)
-! CHECK: fir.cuda_kernel_launch @_QMtest_callPdev_kernel1<<<%c1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%1#1) : (!fir.ref<f32>)
+! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%1#1) : (!fir.ref<f32>)
   end
 
 end
diff --git a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
index e1cc35772618..89de367b723f 100644
--- a/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
+++ b/flang/test/Lower/CUDA/cuda-kernel-loop-directive.cuf
@@ -20,7 +20,7 @@ subroutine sub1()
 ! CHECK: %[[LB:.*]] = fir.convert %c1{{.*}} : (i32) -> index
 ! CHECK: %[[UB:.*]] = fir.convert %c100{{.*}} : (i32) -> index
 ! CHECK: %[[STEP:.*]] = arith.constant 1 : index
-! CHECK: fir.cuda_kernel<<<%c1_i32, %c2_i32>>> (%[[ARG0:.*]] : index) = (%[[LB]] : index) to (%[[UB]] : index)  step (%[[STEP]] : index)
+! CHECK: cuf.kernel<<<%c1_i32, %c2_i32>>> (%[[ARG0:.*]] : index) = (%[[LB]] : index) to (%[[UB]] : index)  step (%[[STEP]] : index)
 ! CHECK-NOT: fir.do_loop
 ! CHECK: %[[ARG0_I32:.*]] = fir.convert %[[ARG0]] : (index) -> i32
 ! CHECK: fir.store %[[ARG0_I32]] to %[[IV]]#1 : !fir.ref<i32>
@@ -32,7 +32,7 @@ subroutine sub1()
     a(i) = a(i) * b(i)
   end do
 
-! CHECK: fir.cuda_kernel<<<*, *>>> (%{{.*}} : index) = (%{{.*}} : index) to (%{{.*}} : index)  step (%{{.*}} : index)
+! CHECK: cuf.kernel<<<*, *>>> (%{{.*}} : index) = (%{{.*}} : index) to (%{{.*}} : index)  step (%{{.*}} : index)
 
   !$cuf kernel do(2) <<< 1, (256,1) >>>
   do i = 1, n
@@ -41,7 +41,7 @@ subroutine sub1()
     end do
   end do
 
-! CHECK: fir.cuda_kernel<<<%c1{{.*}}, (%c256{{.*}}, %c1{{.*}})>>> (%[[ARG0:.*]] : index, %[[ARG1:.*]] : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index) step (%{{.*}}, %{{.*}} : index, index)
+! CHECK: cuf.kernel<<<%c1{{.*}}, (%c256{{.*}}, %c1{{.*}})>>> (%[[ARG0:.*]] : index, %[[ARG1:.*]] : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index) step (%{{.*}}, %{{.*}} : index, index)
 ! CHECK: %[[ARG0_I32:.*]] = fir.convert %[[ARG0]] : (index) -> i32
 ! CHECK: fir.store %[[ARG0_I32]] to %[[IV]]#1 : !fir.ref<i32>
 ! CHECK: %[[ARG1_I32:.*]] = fir.convert %[[ARG1]] : (index) -> i32
@@ -54,7 +54,7 @@ subroutine sub1()
       c(i,j) = c(i,j) * d(i,j)
     end do
   end do
-! CHECK: fir.cuda_kernel<<<(%c1{{.*}}, %c0{{.*}}), (%c256{{.*}}, %c1{{.*}})>>> (%{{.*}} : index, %{{.*}} : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index)  step (%{{.*}}, %{{.*}} : index, index)
+! CHECK: cuf.kernel<<<(%c1{{.*}}, %c0{{.*}}), (%c256{{.*}}, %c1{{.*}})>>> (%{{.*}} : index, %{{.*}} : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index)  step (%{{.*}}, %{{.*}} : index, index)
 
 !$cuf kernel do(2) <<< (*,*), (32,4) >>>
   do i = 1, n
@@ -63,5 +63,5 @@ subroutine sub1()
     end do
   end do
 
-! CHECK: fir.cuda_kernel<<<*, (%c32{{.*}}, %c4{{.*}})>>> (%{{.*}} : index, %{{.*}} : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index)  step (%{{.*}}, %{{.*}} : index, index)
+! CHECK: cuf.kernel<<<*, (%c32{{.*}}, %c4{{.*}})>>> (%{{.*}} : index, %{{.*}} : index) = (%{{.*}}, %{{.*}} : index, index) to (%{{.*}}, %{{.*}} : index, index)  step (%{{.*}}, %{{.*}} : index, index)
 end
diff --git a/flang/test/Lower/CUDA/cuda-mod.cuf b/flang/test/Lower/CUDA/cuda-mod.cuf
index ae5bf63d2da4..f03e72e94780 100644
--- a/flang/test/Lower/CUDA/cuda-mod.cuf
+++ b/flang/test/Lower/CUDA/cuda-mod.cuf
@@ -10,6 +10,6 @@ contains
   end
 end module
 
-! CHECK: fir.global @_QMcuf_modEmd {cuda_attr = #fir.cuda<device>} : f32
+! CHECK: fir.global @_QMcuf_modEmd {data_attr = #cuf.cuda<device>} : f32
 
-! CHECK: func.func @_QMcuf_modPdevsub() attributes {fir.cuda_attr = #fir.cuda_proc<device>}
+! CHECK: func.func @_QMcuf_modPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<device>}
diff --git a/flang/test/Lower/CUDA/cuda-module-use.cuf b/flang/test/Lower/CUDA/cuda-module-use.cuf
index 47d3805065a5..130fefab24d9 100644
--- a/flang/test/Lower/CUDA/cuda-module-use.cuf
+++ b/flang/test/Lower/CUDA/cuda-module-use.cuf
@@ -10,16 +10,16 @@ end
 
 ! CHECK-LABEL: func.func @_QPsub1()
 ! CHECK: %[[ADDR:.*]] = fir.address_of(@_QMcuf_modEmd) : !fir.ref<f32>
-! CHECK: %{{.*}}:2 = hlfir.declare %[[ADDR]] {cuda_attr = #fir.cuda<device>, uniq_name = "_QMcuf_modEmd"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK: %{{.*}}:2 = hlfir.declare %[[ADDR]] {data_attr = #cuf.cuda<device>, uniq_name = "_QMcuf_modEmd"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
 
 attributes(device) subroutine sub2()
   use cuf_mod
   call devsub()
 end
 
-! CHECK-LABEL: func.func @_QPsub2() attributes {fir.cuda_attr = #fir.cuda_proc<device>}
+! CHECK-LABEL: func.func @_QPsub2() attributes {cuf.proc_attr = #cuf.cuda_proc<device>}
 ! CHECK: fir.call @_QMcuf_modPdevsub()
 
-! CHECK-LABEL: fir.global @_QMcuf_modEmd {cuda_attr = #fir.cuda<device>} : f32
+! CHECK-LABEL: fir.global @_QMcuf_modEmd {data_attr = #cuf.cuda<device>} : f32
 
-! CHECK-LABEL: func.func private @_QMcuf_modPdevsub() attributes {fir.cuda_attr = #fir.cuda_proc<device>}
+! CHECK-LABEL: func.func private @_QMcuf_modPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<device>}
diff --git a/flang/test/Lower/CUDA/cuda-proc-attribute.cuf b/flang/test/Lower/CUDA/cuda-proc-attribute.cuf
index d9765f6cd2fe..f8b8dd8e296b 100644
--- a/flang/test/Lower/CUDA/cuda-proc-attribute.cuf
+++ b/flang/test/Lower/CUDA/cuda-proc-attribute.cuf
@@ -4,40 +4,40 @@
 ! Test lowering of CUDA attribute on procedures.
 
 attributes(host) subroutine sub_host(); end
-! CHECK: func.func @_QPsub_host() attributes {fir.cuda_attr = #fir.cuda_proc<host>}
+! CHECK: func.func @_QPsub_host() attributes {cuf.proc_attr = #cuf.cuda_proc<host>}
 
 attributes(device) subroutine sub_device(); end
-! CHECK: func.func @_QPsub_device() attributes {fir.cuda_attr = #fir.cuda_proc<device>}
+! CHECK: func.func @_QPsub_device() attributes {cuf.proc_attr = #cuf.cuda_proc<device>}
 
 attributes(host) attributes(device) subroutine sub_host_device; end
-! CHECK: func.func @_QPsub_host_device() attributes {fir.cuda_attr = #fir.cuda_proc<host_device>}
+! CHECK: func.func @_QPsub_host_device() attributes {cuf.proc_attr = #cuf.cuda_proc<host_device>}
 
 attributes(device) attributes(host) subroutine sub_device_host; end
-! CHECK: func.func @_QPsub_device_host() attributes {fir.cuda_attr = #fir.cuda_proc<host_device>}
+! CHECK: func.func @_QPsub_device_host() attributes {cuf.proc_attr = #cuf.cuda_proc<host_device>}
 
 attributes(global) subroutine sub_global(); end
-! CHECK: func.func @_QPsub_global() attributes {fir.cuda_attr = #fir.cuda_proc<global>}
+! CHECK: func.func @_QPsub_global() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
 
 attributes(grid_global) subroutine sub_grid_global(); end
-! CHECK: func.func @_QPsub_grid_global() attributes {fir.cuda_attr = #fir.cuda_proc<grid_global>}
+! CHECK: func.func @_QPsub_grid_global() attributes {cuf.proc_attr = #cuf.cuda_proc<grid_global>}
 
 attributes(host) integer function fct_host(); end
-! CHECK: func.func @_QPfct_host() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<host>}
+! CHECK: func.func @_QPfct_host() -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc<host>}
 
 attributes(device) integer function fct_device(); end
-! CHECK: func.func @_QPfct_device() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<device>}
+! CHECK: func.func @_QPfct_device() -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc<device>}
 
 attributes(host) attributes(device) integer function fct_host_device; end
-! CHECK: func.func @_QPfct_host_device() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<host_device>}
+! CHECK: func.func @_QPfct_host_device() -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc<host_device>}
 
 attributes(device) attributes(host) integer function fct_device_host; end
-! CHECK: func.func @_QPfct_device_host() -> i32 attributes {fir.cuda_attr = #fir.cuda_proc<host_device>}
+! CHECK: func.func @_QPfct_device_host() -> i32 attributes {cuf.proc_attr = #cuf.cuda_proc<host_device>}
 
 attributes(global) launch_bounds(1, 2) subroutine sub_lbounds1(); end
-! CHECK: func.func @_QPsub_lbounds1() attributes {fir.cuda_attr = #fir.cuda_proc<global>, fir.cuda_launch_bounds = #fir.launch_bounds<maxTPB = 1 : i64, minBPM = 2 : i64>}
+! CHECK: func.func @_QPsub_lbounds1() attributes {cuf.launch_bounds = #cuf.launch_bounds<maxTPB = 1 : i64, minBPM = 2 : i64>, cuf.proc_attr = #cuf.cuda_proc<global>}
 
 attributes(global) launch_bounds(1, 2, 3) subroutine sub_lbounds2(); end
-! CHECK: func.func @_QPsub_lbounds2() attributes {fir.cuda_attr = #fir.cuda_proc<global>, fir.cuda_launch_bounds = #fir.launch_bounds<maxTPB = 1 : i64, minBPM = 2 : i64, upperBoundClusterSize = 3 : i64>}
+! CHECK: func.func @_QPsub_lbounds2() attributes {cuf.launch_bounds = #cuf.launch_bounds<maxTPB = 1 : i64, minBPM = 2 : i64, upperBoundClusterSize = 3 : i64>, cuf.proc_attr = #cuf.cuda_proc<global>}
 
 attributes(global) cluster_dims(1, 2, 3) subroutine sub_clusterdims1(); end
-! CHECK: func.func @_QPsub_clusterdims1() attributes {fir.cuda_attr = #fir.cuda_proc<global>, fir.cuda_cluster_dims = #fir.cluster_dims<x = 1 : i64, y = 2 : i64, z = 3 : i64>}
+! CHECK: func.func @_QPsub_clusterdims1() attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 1 : i64, y = 2 : i64, z = 3 : i64>, cuf.proc_attr = #cuf.cuda_proc<global>}
diff --git a/flang/tools/bbc/CMakeLists.txt b/flang/tools/bbc/CMakeLists.txt
index f21fa3b7bae3..9410fd005660 100644
--- a/flang/tools/bbc/CMakeLists.txt
+++ b/flang/tools/bbc/CMakeLists.txt
@@ -16,6 +16,8 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS)
 
 target_link_libraries(bbc PRIVATE
+CUFAttrs
+CUFDialect
 FIRDialect
 FIRDialectSupport
 FIRSupport
diff --git a/flang/tools/fir-opt/CMakeLists.txt b/flang/tools/fir-opt/CMakeLists.txt
index 43b0c74696f5..43679a9d5357 100644
--- a/flang/tools/fir-opt/CMakeLists.txt
+++ b/flang/tools/fir-opt/CMakeLists.txt
@@ -11,6 +11,8 @@ if(FLANG_INCLUDE_TESTS)
 endif()
 
 target_link_libraries(fir-opt PRIVATE
+  CUFAttrs
+  CUFDialect
   FIRDialect
   FIRSupport
   FIRTransforms
diff --git a/flang/tools/tco/CMakeLists.txt b/flang/tools/tco/CMakeLists.txt
index 6d83353b4e0d..808219ac361f 100644
--- a/flang/tools/tco/CMakeLists.txt
+++ b/flang/tools/tco/CMakeLists.txt
@@ -7,6 +7,8 @@ llvm_update_compile_flags(tco)
 get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS)
 target_link_libraries(tco PRIVATE
+  CUFAttrs
+  CUFDialect
   FIRCodeGen
   FIRDialect
   FIRDialectSupport
diff --git a/flang/unittests/Optimizer/CMakeLists.txt b/flang/unittests/Optimizer/CMakeLists.txt
index 9c165d998e2e..7299e3ee0529 100644
--- a/flang/unittests/Optimizer/CMakeLists.txt
+++ b/flang/unittests/Optimizer/CMakeLists.txt
@@ -2,6 +2,7 @@ get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 get_property(extension_libs GLOBAL PROPERTY MLIR_EXTENSION_LIBS)
 
 set(LIBS
+  CUFDialect
   FIRBuilder
   FIRCodeGen
   FIRDialect
@@ -36,6 +37,7 @@ add_flang_unittest(FlangOptimizerTests
   KindMappingTest.cpp
   RTBuilder.cpp
 DEPENDS
+  CUFDialect
   FIRDialect
   FIRSupport
   HLFIRDialect
diff --git a/flang/unittests/Optimizer/FortranVariableTest.cpp b/flang/unittests/Optimizer/FortranVariableTest.cpp
index f5f559ef887c..87efb624735c 100644
--- a/flang/unittests/Optimizer/FortranVariableTest.cpp
+++ b/flang/unittests/Optimizer/FortranVariableTest.cpp
@@ -51,7 +51,7 @@ TEST_F(FortranVariableTest, SimpleScalar) {
       /*shape=*/mlir::Value{}, /*typeParams=*/std::nullopt,
       /*dummy_scope=*/nullptr, name,
       /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
-      /*cuda_attr=*/fir::CUDADataAttributeAttr{});
+      /*data_attr=*/cuf::DataAttributeAttr{});
 
   fir::FortranVariableOpInterface fortranVariable = declare;
   EXPECT_FALSE(fortranVariable.isArray());
@@ -77,7 +77,7 @@ TEST_F(FortranVariableTest, CharacterScalar) {
   auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
       /*shape=*/mlir::Value{}, typeParams, /*dummy_scope=*/nullptr, name,
       /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
-      /*cuda_attr=*/fir::CUDADataAttributeAttr{});
+      /*data_attr=*/cuf::DataAttributeAttr{});
 
   fir::FortranVariableOpInterface fortranVariable = declare;
   EXPECT_FALSE(fortranVariable.isArray());
@@ -108,7 +108,7 @@ TEST_F(FortranVariableTest, SimpleArray) {
   auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
       shape, /*typeParams*/ std::nullopt, /*dummy_scope=*/nullptr, name,
       /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
-      /*cuda_attr=*/fir::CUDADataAttributeAttr{});
+      /*data_attr=*/cuf::DataAttributeAttr{});
 
   fir::FortranVariableOpInterface fortranVariable = declare;
   EXPECT_TRUE(fortranVariable.isArray());
@@ -139,7 +139,7 @@ TEST_F(FortranVariableTest, CharacterArray) {
   auto declare = builder->create<fir::DeclareOp>(loc, addr.getType(), addr,
       shape, typeParams, /*dummy_scope=*/nullptr, name,
       /*fortran_attrs=*/fir::FortranVariableFlagsAttr{},
-      /*cuda_attr=*/fir::CUDADataAttributeAttr{});
+      /*data_attr=*/cuf::DataAttributeAttr{});
 
   fir::FortranVariableOpInterface fortranVariable = declare;
   EXPECT_TRUE(fortranVariable.isArray());