diff --git a/cmake/dependencies.cmake b/cmake/dependencies.cmake index ef6f452a8..1223f687e 100644 --- a/cmake/dependencies.cmake +++ b/cmake/dependencies.cmake @@ -142,6 +142,7 @@ if(NOT ${MICM_GPU_TYPE} STREQUAL "None") enable_language(CUDA) find_package(CUDAToolkit REQUIRED) set(MICM_ENABLE_CUDA ON) + set(CUDA_STANDARD_REQUIRED ON) endif() ################################################################################ diff --git a/include/micm/solver/cuda_rosenbrock.cuh b/include/micm/solver/cuda_rosenbrock.cuh index 6bf38d59c..32ebf6e64 100644 --- a/include/micm/solver/cuda_rosenbrock.cuh +++ b/include/micm/solver/cuda_rosenbrock.cuh @@ -42,7 +42,6 @@ namespace micm const CudaMatrixParam& y_new_param, const CudaMatrixParam& errors_param, const RosenbrockSolverParameters& ros_param, - cublasHandle_t handle, CudaRosenbrockSolverParam devstruct); } // namespace cuda } // namespace micm \ No newline at end of file diff --git a/include/micm/solver/cuda_rosenbrock.hpp b/include/micm/solver/cuda_rosenbrock.hpp index 2033e06a0..efe61e7d6 100644 --- a/include/micm/solver/cuda_rosenbrock.hpp +++ b/include/micm/solver/cuda_rosenbrock.hpp @@ -136,13 +136,11 @@ namespace micm double NormalizedError(const DenseMatrixPolicy& y_old, const DenseMatrixPolicy& y_new, const DenseMatrixPolicy& errors) const requires(CudaMatrix&& VectorizableDense) { - // At this point, it does not matter which handle we use; may revisit it when we have a multi-node-multi-GPU test return micm::cuda::NormalizedErrorDriver( y_old.AsDeviceParam(), y_new.AsDeviceParam(), errors.AsDeviceParam(), this->parameters_, - errors.AsCublasHandle(), this->devstruct_); } diff --git a/include/micm/util/cuda_dense_matrix.hpp b/include/micm/util/cuda_dense_matrix.hpp index e2db3b9c1..5bae7c33c 100644 --- a/include/micm/util/cuda_dense_matrix.hpp +++ b/include/micm/util/cuda_dense_matrix.hpp @@ -6,7 +6,6 @@ #include #include #include - #include #include @@ -66,8 +65,6 @@ namespace micm private: /// @brief The device pointer (handle) to the allocated memory on the target device. CudaMatrixParam param_; - /// @brief The handle to the CUBLAS library - cublasHandle_t handle_ = NULL; public: CudaDenseMatrix() requires(std::is_same_v) @@ -88,7 +85,6 @@ namespace micm this->param_.number_of_elements_ = this->data_.size(); this->param_.number_of_grid_cells_ = x_dim; CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc"); - CHECK_CUBLAS_ERROR(cublasCreate(&(this->handle_)), "CUBLAS initialization failed..."); } CudaDenseMatrix(std::size_t x_dim, std::size_t y_dim) : VectorMatrix(x_dim, y_dim) @@ -101,7 +97,6 @@ namespace micm this->param_.number_of_elements_ = this->data_.size(); this->param_.number_of_grid_cells_ = x_dim; CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc"); - CHECK_CUBLAS_ERROR(cublasCreate(&(this->handle_)), "CUBLAS initialization failed..."); } CudaDenseMatrix(std::size_t x_dim, std::size_t y_dim, T initial_value) : VectorMatrix(x_dim, y_dim, initial_value) @@ -118,7 +113,6 @@ namespace micm this->param_.number_of_elements_ += inner_vector.size(); } CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc"); - CHECK_CUBLAS_ERROR(cublasCreate(&(this->handle_)), "CUBLAS initialization failed..."); } CudaDenseMatrix(const std::vector> other) @@ -135,7 +129,6 @@ namespace micm this->param_.number_of_grid_cells_ = other.param_.number_of_grid_cells_; CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc"); CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice(this->param_, other.param_), "cudaMemcpyDeviceToDevice"); - CHECK_CUBLAS_ERROR(cublasCreate(&(this->handle_)), "CUBLAS initialization failed..."); } CudaDenseMatrix(const CudaDenseMatrix& other) @@ -148,7 +141,6 @@ namespace micm { this->param_.d_data_ = nullptr; std::swap(this->param_, other.param_); - std::swap(this->handle_, other.handle_); } CudaDenseMatrix& operator=(const CudaDenseMatrix& other) @@ -159,7 +151,6 @@ namespace micm this->param_ = other.param_; CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc"); CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice(this->param_, other.param_), "cudaMemcpyDeviceToDevice"); - CHECK_CUBLAS_ERROR(cublasCreate(&(this->handle_)), "CUBLAS initialization failed..."); return *this; } @@ -169,7 +160,6 @@ namespace micm { VectorMatrix::operator=(other); std::swap(this->param_, other.param_); - std::swap(this->handle_, other.handle_); } return *this; } @@ -181,12 +171,7 @@ namespace micm ~CudaDenseMatrix() requires(std::is_same_v) { CHECK_CUDA_ERROR(micm::cuda::FreeVector(this->param_), "cudaFree"); - if (this->handle_ != NULL) - { - cublasDestroy(this->handle_); - } this->param_.d_data_ = nullptr; - this->handle_ = NULL; } void CopyToDevice() @@ -206,11 +191,6 @@ namespace micm return this->param_; } - cublasHandle_t AsCublasHandle() const - { - return this->handle_; - } - /// @brief For each element in the VectorMatrix x and y, perform y = alpha * x + y, /// where alpha is a scalar constant. /// @param alpha The scaling scalar to apply to the VectorMatrix x @@ -223,8 +203,8 @@ namespace micm static_assert(std::is_same_v); CHECK_CUBLAS_ERROR( cublasDaxpy( - this->handle_, x.param_.number_of_elements_, &alpha, x.param_.d_data_, incx, this->param_.d_data_, incy), - "CUBLAS Daxpy operation failed..."); + micm::cuda::GetCublasHandle(), x.param_.number_of_elements_, &alpha, x.param_.d_data_, incx, this->param_.d_data_, incy), + "CUBLAS Daxpy operation failed..."); } // Copy the device data from the other Cuda dense matrix into this one diff --git a/include/micm/util/cuda_util.cuh b/include/micm/util/cuda_util.cuh index 8e09c9aeb..9f6b6e43e 100644 --- a/include/micm/util/cuda_util.cuh +++ b/include/micm/util/cuda_util.cuh @@ -4,7 +4,6 @@ #include #include - #include #define CHECK_CUDA_ERROR(err, msg) micm::cuda::CheckCudaError(err, __FILE__, __LINE__, msg) @@ -27,5 +26,8 @@ namespace micm /// @param line Line number where error occurred /// @param str Additional string to print with error message void CheckCublasError(cublasStatus_t err, const char* file, int line, std::string str); + + /// @brief Get the cuBLAS handle for the current device + cublasHandle_t& GetCublasHandle(); } // namespace cuda } // namespace micm \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 313367cb0..ae2ba7420 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -53,10 +53,10 @@ if(MICM_ENABLE_CUDA) target_link_libraries(micm_cuda PRIVATE micm ) - target_link_libraries(micm_cuda PUBLIC CUDA::cudart CUDA::cublas ) + set_property(TARGET micm_cuda PROPERTY CUDA_STANDARD 20) endif() if (MICM_ENABLE_PROFILE) diff --git a/src/solver/rosenbrock.cu b/src/solver/rosenbrock.cu index 85205c2b0..73616a5e8 100644 --- a/src/solver/rosenbrock.cu +++ b/src/solver/rosenbrock.cu @@ -251,7 +251,6 @@ namespace micm const CudaMatrixParam& y_new_param, const CudaMatrixParam& errors_param, const RosenbrockSolverParameters& ros_param, - cublasHandle_t handle, CudaRosenbrockSolverParam devstruct) { double normalized_error; @@ -263,8 +262,9 @@ namespace micm " but got: " + std::to_string(errors_param.number_of_elements_); INTERNAL_ERROR(msg.c_str()); } - cudaError_t err = cudaMemcpy( - devstruct.errors_input_, errors_param.d_data_, sizeof(double) * number_of_elements, cudaMemcpyDeviceToDevice); + CHECK_CUDA_ERROR(cudaMemcpy( + devstruct.errors_input_, errors_param.d_data_, sizeof(double) * number_of_elements, cudaMemcpyDeviceToDevice), + "cudaMemcpy"); if (number_of_elements > 1000000) { @@ -273,11 +273,7 @@ namespace micm ScaledErrorKernel<<>>(y_old_param, y_new_param, ros_param, devstruct); // call cublas function to perform the norm: // https://docs.nvidia.com/cuda/cublas/index.html?highlight=dnrm2#cublas-t-nrm2 - cublasStatus_t stat = cublasDnrm2(handle, number_of_elements, devstruct.errors_input_, 1, &normalized_error); - if (stat != CUBLAS_STATUS_SUCCESS) - { - ThrowInternalError(MicmInternalErrc::Cublas, __FILE__, __LINE__, cublasGetStatusString(stat)); - } + CHECK_CUBLAS_ERROR(cublasDnrm2(micm::cuda::GetCublasHandle(), number_of_elements, devstruct.errors_input_, 1, &normalized_error), "cublasDnrm2"); normalized_error = normalized_error * std::sqrt(1.0 / number_of_elements); } else diff --git a/src/util/cuda_util.cu b/src/util/cuda_util.cu index 7e350eac9..5dc05fd39 100644 --- a/src/util/cuda_util.cu +++ b/src/util/cuda_util.cu @@ -4,6 +4,10 @@ #include #include +#include +#include +#include +#include namespace micm { @@ -26,5 +30,39 @@ namespace micm ThrowInternalError(MicmInternalErrc::Cublas, file, line, msg.c_str()); } } + + // Define a functor for the cublasHandle_t unique pointer deleter + struct CublasHandleDeleter { + void operator()(cublasHandle_t* handle) const { + if (handle != nullptr) { + CHECK_CUBLAS_ERROR(cublasDestroy(*handle), "CUBLAS finalization failed"); + delete handle; + } + } + }; + + // Define the smart pointer type using the functor for the custom deleter + using CublasHandlePtr = std::unique_ptr; + + // Create a cublas handle and return a unique pointer to it + CublasHandlePtr CreateCublasHandle() { + cublasHandle_t* handle = new cublasHandle_t; + CHECK_CUBLAS_ERROR(cublasCreate(handle), "CUBLAS initialization failed..."); + return CublasHandlePtr(handle, CublasHandleDeleter()); + } + + cublasHandle_t& GetCublasHandle() + { + static std::map cublas_handles_map; + static std::mutex mutex; + int device_id; + CHECK_CUDA_ERROR(cudaGetDevice(&device_id), "Failed to get device ID..."); + std::lock_guard lock(mutex); // lock the mutex and generate a new cublas handle below + if (auto search = cublas_handles_map.find(device_id); search == cublas_handles_map.end()) + { + cublas_handles_map[device_id] = std::move(CreateCublasHandle()); + } + return *cublas_handles_map[device_id]; + } } // namespace cuda } // namespace micm