Share CUBLAS handle between different CUBLAS matrices (#543)

* share cublas handle between different cuda matrices * add the singleton class implementation, does not work * update the cmake file for different NVIDIA GPU types * set the CMAKE_CUDA_ARCHITECTURES correctly with input GPU type * remove debugging statement and add one comment * clean up the code for cublas handle * update the cublas singleton class * remove duplicated code * fix an env variable bug and clean up more code * clean up unused variable and fix a typo * add std::map to replace the env variables check enforce the c++20 for CUDA code * some minor typo fixes * change the location of the cublas handle * remove the if scope for the mutex lock * query device id in the cublas singleton class * change singleton class to a static std::map variable * use smart pointer to deallocate the cublas handle explicitly --------- Co-authored-by: Jian Sun <[email protected]>
NCAR · Jun 13, 2024 · 1fa0bac · 1fa0bac
1 parent f8f46df
commit 1fa0bac
Show file tree

Hide file tree

Showing 8 changed files with 49 additions and 35 deletions.
diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake
@@ -142,6 +142,7 @@ if(NOT ${MICM_GPU_TYPE} STREQUAL "None")
   enable_language(CUDA)
   find_package(CUDAToolkit REQUIRED)
   set(MICM_ENABLE_CUDA ON)
+  set(CUDA_STANDARD_REQUIRED ON)
 endif()
 
 ################################################################################

diff --git a/include/micm/solver/cuda_rosenbrock.cuh b/include/micm/solver/cuda_rosenbrock.cuh
@@ -42,7 +42,6 @@ namespace micm
         const CudaMatrixParam& y_new_param,
         const CudaMatrixParam& errors_param,
         const RosenbrockSolverParameters& ros_param,
-        cublasHandle_t handle,
         CudaRosenbrockSolverParam devstruct);
   }  // namespace cuda
 }  // namespace micm
diff --git a/include/micm/solver/cuda_rosenbrock.hpp b/include/micm/solver/cuda_rosenbrock.hpp
@@ -136,13 +136,11 @@ namespace micm
     double NormalizedError(const DenseMatrixPolicy& y_old, const DenseMatrixPolicy& y_new, const DenseMatrixPolicy& errors)
         const requires(CudaMatrix<DenseMatrixPolicy>&& VectorizableDense<DenseMatrixPolicy>)
     {
-      // At this point, it does not matter which handle we use; may revisit it when we have a multi-node-multi-GPU test
       return micm::cuda::NormalizedErrorDriver(
           y_old.AsDeviceParam(),
           y_new.AsDeviceParam(),
           errors.AsDeviceParam(),
           this->parameters_,
-          errors.AsCublasHandle(),
           this->devstruct_);
     }
 

diff --git a/include/micm/util/cuda_dense_matrix.hpp b/include/micm/util/cuda_dense_matrix.hpp
@@ -6,7 +6,6 @@
 #include <micm/util/cuda_util.cuh>
 #include <micm/util/error.hpp>
 #include <micm/util/vector_matrix.hpp>
-
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
 
@@ -66,8 +65,6 @@ namespace micm
    private:
     /// @brief The device pointer (handle) to the allocated memory on the target device.
     CudaMatrixParam param_;
-    /// @brief The handle to the CUBLAS library
-    cublasHandle_t handle_ = NULL;
 
    public:
     CudaDenseMatrix() requires(std::is_same_v<T, double>)
@@ -88,7 +85,6 @@ namespace micm
       this->param_.number_of_elements_ = this->data_.size();
       this->param_.number_of_grid_cells_ = x_dim;
       CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
-      CHECK_CUBLAS_ERROR(cublasCreate(&(this->handle_)), "CUBLAS initialization failed...");
     }
     CudaDenseMatrix(std::size_t x_dim, std::size_t y_dim)
         : VectorMatrix<T, L>(x_dim, y_dim)
@@ -101,7 +97,6 @@ namespace micm
       this->param_.number_of_elements_ = this->data_.size();
       this->param_.number_of_grid_cells_ = x_dim;
       CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
-      CHECK_CUBLAS_ERROR(cublasCreate(&(this->handle_)), "CUBLAS initialization failed...");
     }
     CudaDenseMatrix(std::size_t x_dim, std::size_t y_dim, T initial_value)
         : VectorMatrix<T, L>(x_dim, y_dim, initial_value)
@@ -118,7 +113,6 @@ namespace micm
         this->param_.number_of_elements_ += inner_vector.size();
       }
       CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
-      CHECK_CUBLAS_ERROR(cublasCreate(&(this->handle_)), "CUBLAS initialization failed...");
     }
 
     CudaDenseMatrix(const std::vector<std::vector<T>> other)
@@ -135,7 +129,6 @@ namespace micm
       this->param_.number_of_grid_cells_ = other.param_.number_of_grid_cells_;
       CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
       CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice(this->param_, other.param_), "cudaMemcpyDeviceToDevice");
-      CHECK_CUBLAS_ERROR(cublasCreate(&(this->handle_)), "CUBLAS initialization failed...");
     }
 
     CudaDenseMatrix(const CudaDenseMatrix& other)
@@ -148,7 +141,6 @@ namespace micm
     {
       this->param_.d_data_ = nullptr;
       std::swap(this->param_, other.param_);
-      std::swap(this->handle_, other.handle_);
     }
 
     CudaDenseMatrix& operator=(const CudaDenseMatrix& other)
@@ -159,7 +151,6 @@ namespace micm
       this->param_ = other.param_;
       CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
       CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice(this->param_, other.param_), "cudaMemcpyDeviceToDevice");
-      CHECK_CUBLAS_ERROR(cublasCreate(&(this->handle_)), "CUBLAS initialization failed...");
       return *this;
     }
 
@@ -169,7 +160,6 @@ namespace micm
       {
         VectorMatrix<T, L>::operator=(other);
         std::swap(this->param_, other.param_);
-        std::swap(this->handle_, other.handle_);
       }
       return *this;
     }
@@ -181,12 +171,7 @@ namespace micm
     ~CudaDenseMatrix() requires(std::is_same_v<T, double>)
     {
       CHECK_CUDA_ERROR(micm::cuda::FreeVector(this->param_), "cudaFree");
-      if (this->handle_ != NULL)
-      {
-        cublasDestroy(this->handle_);
-      }
       this->param_.d_data_ = nullptr;
-      this->handle_ = NULL;
     }
 
     void CopyToDevice()
@@ -206,11 +191,6 @@ namespace micm
       return this->param_;
     }
 
-    cublasHandle_t AsCublasHandle() const
-    {
-      return this->handle_;
-    }
-
     /// @brief For each element in the VectorMatrix x and y, perform y = alpha * x + y,
     ///        where alpha is a scalar constant.
     /// @param alpha The scaling scalar to apply to the VectorMatrix x
@@ -223,8 +203,8 @@ namespace micm
       static_assert(std::is_same_v<T, double>);
       CHECK_CUBLAS_ERROR(
           cublasDaxpy(
-              this->handle_, x.param_.number_of_elements_, &alpha, x.param_.d_data_, incx, this->param_.d_data_, incy),
-          "CUBLAS Daxpy operation failed...");
+            micm::cuda::GetCublasHandle(), x.param_.number_of_elements_, &alpha, x.param_.d_data_, incx, this->param_.d_data_, incy),
+            "CUBLAS Daxpy operation failed...");
     }
 
     // Copy the device data from the other Cuda dense matrix into this one

diff --git a/include/micm/util/cuda_util.cuh b/include/micm/util/cuda_util.cuh
@@ -4,7 +4,6 @@
 
 #include <cublas_v2.h>
 #include <cuda_runtime.h>
-
 #include <string>
 
 #define CHECK_CUDA_ERROR(err, msg)   micm::cuda::CheckCudaError(err, __FILE__, __LINE__, msg)
@@ -27,5 +26,8 @@ namespace micm
     /// @param line Line number where error occurred
     /// @param str Additional string to print with error message
     void CheckCublasError(cublasStatus_t err, const char* file, int line, std::string str);
+
+    /// @brief Get the cuBLAS handle for the current device
+    cublasHandle_t& GetCublasHandle();
   }  // namespace cuda
 }  // namespace micm
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -53,10 +53,10 @@ if(MICM_ENABLE_CUDA)
   target_link_libraries(micm_cuda
     PRIVATE micm
   )
-
   target_link_libraries(micm_cuda 
     PUBLIC CUDA::cudart CUDA::cublas
   )
+  set_property(TARGET micm_cuda PROPERTY CUDA_STANDARD 20)
 endif()
 
 if (MICM_ENABLE_PROFILE)

diff --git a/src/solver/rosenbrock.cu b/src/solver/rosenbrock.cu
@@ -251,7 +251,6 @@ namespace micm
         const CudaMatrixParam& y_new_param,
         const CudaMatrixParam& errors_param,
         const RosenbrockSolverParameters& ros_param,
-        cublasHandle_t handle,
         CudaRosenbrockSolverParam devstruct)
     {
       double normalized_error;
@@ -263,8 +262,9 @@ namespace micm
                           " but got: " + std::to_string(errors_param.number_of_elements_);
         INTERNAL_ERROR(msg.c_str());
       }
-      cudaError_t err = cudaMemcpy(
-          devstruct.errors_input_, errors_param.d_data_, sizeof(double) * number_of_elements, cudaMemcpyDeviceToDevice);
+      CHECK_CUDA_ERROR(cudaMemcpy(
+          devstruct.errors_input_, errors_param.d_data_, sizeof(double) * number_of_elements, cudaMemcpyDeviceToDevice),
+          "cudaMemcpy");
 
       if (number_of_elements > 1000000)
       {
@@ -273,11 +273,7 @@ namespace micm
         ScaledErrorKernel<<<number_of_blocks, BLOCK_SIZE>>>(y_old_param, y_new_param, ros_param, devstruct);
         // call cublas function to perform the norm:
         // https://docs.nvidia.com/cuda/cublas/index.html?highlight=dnrm2#cublas-t-nrm2
-        cublasStatus_t stat = cublasDnrm2(handle, number_of_elements, devstruct.errors_input_, 1, &normalized_error);
-        if (stat != CUBLAS_STATUS_SUCCESS)
-        {
-          ThrowInternalError(MicmInternalErrc::Cublas, __FILE__, __LINE__, cublasGetStatusString(stat));
-        }
+        CHECK_CUBLAS_ERROR(cublasDnrm2(micm::cuda::GetCublasHandle(), number_of_elements, devstruct.errors_input_, 1, &normalized_error), "cublasDnrm2");
         normalized_error = normalized_error * std::sqrt(1.0 / number_of_elements);
       }
       else

diff --git a/src/util/cuda_util.cu b/src/util/cuda_util.cu
@@ -4,6 +4,10 @@
 #include <micm/util/internal_error.hpp>
 
 #include <cuda_runtime.h>
+#include <memory>
+#include <mutex>
+#include <map>
+#include <cublas_v2.h>
 
 namespace micm
 {
@@ -26,5 +30,39 @@ namespace micm
         ThrowInternalError(MicmInternalErrc::Cublas, file, line, msg.c_str());
       }
     }
+
+    // Define a functor for the cublasHandle_t unique pointer deleter
+    struct CublasHandleDeleter {
+      void operator()(cublasHandle_t* handle) const {
+        if (handle != nullptr) {
+          CHECK_CUBLAS_ERROR(cublasDestroy(*handle), "CUBLAS finalization failed");
+          delete handle;
+        }
+      }
+    };
+
+    // Define the smart pointer type using the functor for the custom deleter
+    using CublasHandlePtr = std::unique_ptr<cublasHandle_t, CublasHandleDeleter>;
+
+    // Create a cublas handle and return a unique pointer to it
+    CublasHandlePtr CreateCublasHandle() {
+      cublasHandle_t* handle = new cublasHandle_t;
+      CHECK_CUBLAS_ERROR(cublasCreate(handle), "CUBLAS initialization failed...");
+      return CublasHandlePtr(handle, CublasHandleDeleter());
+    }
+
+    cublasHandle_t& GetCublasHandle()
+    {
+      static std::map<int, CublasHandlePtr> cublas_handles_map;
+      static std::mutex mutex;
+      int device_id;
+      CHECK_CUDA_ERROR(cudaGetDevice(&device_id), "Failed to get device ID...");
+      std::lock_guard<std::mutex> lock(mutex); // lock the mutex and generate a new cublas handle below
+      if (auto search = cublas_handles_map.find(device_id); search == cublas_handles_map.end())
+      {
+        cublas_handles_map[device_id] = std::move(CreateCublasHandle());
+      }
+      return *cublas_handles_map[device_id];
+    }
   }  // namespace cuda
 }  // namespace micm