Skip to content

Commit

Permalink
Share CUBLAS handle between different CUBLAS matrices (#543)
Browse files Browse the repository at this point in the history
* share cublas handle between different cuda matrices

* add the singleton class implementation, does not work

* update the cmake file for different NVIDIA GPU types

* set the CMAKE_CUDA_ARCHITECTURES correctly with input GPU type

* remove debugging statement and add one comment

* clean up the code for cublas handle

* update the cublas singleton class

* remove duplicated code

* fix an env variable bug and clean up more code

* clean up unused variable and fix a typo

* add std::map to replace the env variables check
enforce the c++20 for CUDA code

* some minor typo fixes

* change the location of the cublas handle

* remove the if scope for the mutex lock

* query device id in the cublas singleton class

* change singleton class to a static std::map variable

* use smart pointer to deallocate the cublas handle explicitly

---------

Co-authored-by: Jian Sun <[email protected]>
  • Loading branch information
sjsprecious and sjsprecious authored Jun 13, 2024
1 parent f8f46df commit 1fa0bac
Show file tree
Hide file tree
Showing 8 changed files with 49 additions and 35 deletions.
1 change: 1 addition & 0 deletions cmake/dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ if(NOT ${MICM_GPU_TYPE} STREQUAL "None")
enable_language(CUDA)
find_package(CUDAToolkit REQUIRED)
set(MICM_ENABLE_CUDA ON)
set(CUDA_STANDARD_REQUIRED ON)
endif()

################################################################################
Expand Down
1 change: 0 additions & 1 deletion include/micm/solver/cuda_rosenbrock.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ namespace micm
const CudaMatrixParam& y_new_param,
const CudaMatrixParam& errors_param,
const RosenbrockSolverParameters& ros_param,
cublasHandle_t handle,
CudaRosenbrockSolverParam devstruct);
} // namespace cuda
} // namespace micm
2 changes: 0 additions & 2 deletions include/micm/solver/cuda_rosenbrock.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,13 +136,11 @@ namespace micm
double NormalizedError(const DenseMatrixPolicy& y_old, const DenseMatrixPolicy& y_new, const DenseMatrixPolicy& errors)
const requires(CudaMatrix<DenseMatrixPolicy>&& VectorizableDense<DenseMatrixPolicy>)
{
// At this point, it does not matter which handle we use; may revisit it when we have a multi-node-multi-GPU test
return micm::cuda::NormalizedErrorDriver(
y_old.AsDeviceParam(),
y_new.AsDeviceParam(),
errors.AsDeviceParam(),
this->parameters_,
errors.AsCublasHandle(),
this->devstruct_);
}

Expand Down
24 changes: 2 additions & 22 deletions include/micm/util/cuda_dense_matrix.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
#include <micm/util/cuda_util.cuh>
#include <micm/util/error.hpp>
#include <micm/util/vector_matrix.hpp>

#include <cublas_v2.h>
#include <cuda_runtime.h>

Expand Down Expand Up @@ -66,8 +65,6 @@ namespace micm
private:
/// @brief The device pointer (handle) to the allocated memory on the target device.
CudaMatrixParam param_;
/// @brief The handle to the CUBLAS library
cublasHandle_t handle_ = NULL;

public:
CudaDenseMatrix() requires(std::is_same_v<T, double>)
Expand All @@ -88,7 +85,6 @@ namespace micm
this->param_.number_of_elements_ = this->data_.size();
this->param_.number_of_grid_cells_ = x_dim;
CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
CHECK_CUBLAS_ERROR(cublasCreate(&(this->handle_)), "CUBLAS initialization failed...");
}
CudaDenseMatrix(std::size_t x_dim, std::size_t y_dim)
: VectorMatrix<T, L>(x_dim, y_dim)
Expand All @@ -101,7 +97,6 @@ namespace micm
this->param_.number_of_elements_ = this->data_.size();
this->param_.number_of_grid_cells_ = x_dim;
CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
CHECK_CUBLAS_ERROR(cublasCreate(&(this->handle_)), "CUBLAS initialization failed...");
}
CudaDenseMatrix(std::size_t x_dim, std::size_t y_dim, T initial_value)
: VectorMatrix<T, L>(x_dim, y_dim, initial_value)
Expand All @@ -118,7 +113,6 @@ namespace micm
this->param_.number_of_elements_ += inner_vector.size();
}
CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
CHECK_CUBLAS_ERROR(cublasCreate(&(this->handle_)), "CUBLAS initialization failed...");
}

CudaDenseMatrix(const std::vector<std::vector<T>> other)
Expand All @@ -135,7 +129,6 @@ namespace micm
this->param_.number_of_grid_cells_ = other.param_.number_of_grid_cells_;
CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice(this->param_, other.param_), "cudaMemcpyDeviceToDevice");
CHECK_CUBLAS_ERROR(cublasCreate(&(this->handle_)), "CUBLAS initialization failed...");
}

CudaDenseMatrix(const CudaDenseMatrix& other)
Expand All @@ -148,7 +141,6 @@ namespace micm
{
this->param_.d_data_ = nullptr;
std::swap(this->param_, other.param_);
std::swap(this->handle_, other.handle_);
}

CudaDenseMatrix& operator=(const CudaDenseMatrix& other)
Expand All @@ -159,7 +151,6 @@ namespace micm
this->param_ = other.param_;
CHECK_CUDA_ERROR(micm::cuda::MallocVector(this->param_, this->param_.number_of_elements_), "cudaMalloc");
CHECK_CUDA_ERROR(micm::cuda::CopyToDeviceFromDevice(this->param_, other.param_), "cudaMemcpyDeviceToDevice");
CHECK_CUBLAS_ERROR(cublasCreate(&(this->handle_)), "CUBLAS initialization failed...");
return *this;
}

Expand All @@ -169,7 +160,6 @@ namespace micm
{
VectorMatrix<T, L>::operator=(other);
std::swap(this->param_, other.param_);
std::swap(this->handle_, other.handle_);
}
return *this;
}
Expand All @@ -181,12 +171,7 @@ namespace micm
~CudaDenseMatrix() requires(std::is_same_v<T, double>)
{
CHECK_CUDA_ERROR(micm::cuda::FreeVector(this->param_), "cudaFree");
if (this->handle_ != NULL)
{
cublasDestroy(this->handle_);
}
this->param_.d_data_ = nullptr;
this->handle_ = NULL;
}

void CopyToDevice()
Expand All @@ -206,11 +191,6 @@ namespace micm
return this->param_;
}

cublasHandle_t AsCublasHandle() const
{
return this->handle_;
}

/// @brief For each element in the VectorMatrix x and y, perform y = alpha * x + y,
/// where alpha is a scalar constant.
/// @param alpha The scaling scalar to apply to the VectorMatrix x
Expand All @@ -223,8 +203,8 @@ namespace micm
static_assert(std::is_same_v<T, double>);
CHECK_CUBLAS_ERROR(
cublasDaxpy(
this->handle_, x.param_.number_of_elements_, &alpha, x.param_.d_data_, incx, this->param_.d_data_, incy),
"CUBLAS Daxpy operation failed...");
micm::cuda::GetCublasHandle(), x.param_.number_of_elements_, &alpha, x.param_.d_data_, incx, this->param_.d_data_, incy),
"CUBLAS Daxpy operation failed...");
}

// Copy the device data from the other Cuda dense matrix into this one
Expand Down
4 changes: 3 additions & 1 deletion include/micm/util/cuda_util.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

#include <cublas_v2.h>
#include <cuda_runtime.h>

#include <string>

#define CHECK_CUDA_ERROR(err, msg) micm::cuda::CheckCudaError(err, __FILE__, __LINE__, msg)
Expand All @@ -27,5 +26,8 @@ namespace micm
/// @param line Line number where error occurred
/// @param str Additional string to print with error message
void CheckCublasError(cublasStatus_t err, const char* file, int line, std::string str);

/// @brief Get the cuBLAS handle for the current device
cublasHandle_t& GetCublasHandle();
} // namespace cuda
} // namespace micm
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,10 @@ if(MICM_ENABLE_CUDA)
target_link_libraries(micm_cuda
PRIVATE micm
)

target_link_libraries(micm_cuda
PUBLIC CUDA::cudart CUDA::cublas
)
set_property(TARGET micm_cuda PROPERTY CUDA_STANDARD 20)
endif()

if (MICM_ENABLE_PROFILE)
Expand Down
12 changes: 4 additions & 8 deletions src/solver/rosenbrock.cu
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,6 @@ namespace micm
const CudaMatrixParam& y_new_param,
const CudaMatrixParam& errors_param,
const RosenbrockSolverParameters& ros_param,
cublasHandle_t handle,
CudaRosenbrockSolverParam devstruct)
{
double normalized_error;
Expand All @@ -263,8 +262,9 @@ namespace micm
" but got: " + std::to_string(errors_param.number_of_elements_);
INTERNAL_ERROR(msg.c_str());
}
cudaError_t err = cudaMemcpy(
devstruct.errors_input_, errors_param.d_data_, sizeof(double) * number_of_elements, cudaMemcpyDeviceToDevice);
CHECK_CUDA_ERROR(cudaMemcpy(
devstruct.errors_input_, errors_param.d_data_, sizeof(double) * number_of_elements, cudaMemcpyDeviceToDevice),
"cudaMemcpy");

if (number_of_elements > 1000000)
{
Expand All @@ -273,11 +273,7 @@ namespace micm
ScaledErrorKernel<<<number_of_blocks, BLOCK_SIZE>>>(y_old_param, y_new_param, ros_param, devstruct);
// call cublas function to perform the norm:
// https://docs.nvidia.com/cuda/cublas/index.html?highlight=dnrm2#cublas-t-nrm2
cublasStatus_t stat = cublasDnrm2(handle, number_of_elements, devstruct.errors_input_, 1, &normalized_error);
if (stat != CUBLAS_STATUS_SUCCESS)
{
ThrowInternalError(MicmInternalErrc::Cublas, __FILE__, __LINE__, cublasGetStatusString(stat));
}
CHECK_CUBLAS_ERROR(cublasDnrm2(micm::cuda::GetCublasHandle(), number_of_elements, devstruct.errors_input_, 1, &normalized_error), "cublasDnrm2");
normalized_error = normalized_error * std::sqrt(1.0 / number_of_elements);
}
else
Expand Down
38 changes: 38 additions & 0 deletions src/util/cuda_util.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
#include <micm/util/internal_error.hpp>

#include <cuda_runtime.h>
#include <memory>
#include <mutex>
#include <map>
#include <cublas_v2.h>

namespace micm
{
Expand All @@ -26,5 +30,39 @@ namespace micm
ThrowInternalError(MicmInternalErrc::Cublas, file, line, msg.c_str());
}
}

// Define a functor for the cublasHandle_t unique pointer deleter
struct CublasHandleDeleter {
void operator()(cublasHandle_t* handle) const {
if (handle != nullptr) {
CHECK_CUBLAS_ERROR(cublasDestroy(*handle), "CUBLAS finalization failed");
delete handle;
}
}
};

// Define the smart pointer type using the functor for the custom deleter
using CublasHandlePtr = std::unique_ptr<cublasHandle_t, CublasHandleDeleter>;

// Create a cublas handle and return a unique pointer to it
CublasHandlePtr CreateCublasHandle() {
cublasHandle_t* handle = new cublasHandle_t;
CHECK_CUBLAS_ERROR(cublasCreate(handle), "CUBLAS initialization failed...");
return CublasHandlePtr(handle, CublasHandleDeleter());
}

cublasHandle_t& GetCublasHandle()
{
static std::map<int, CublasHandlePtr> cublas_handles_map;
static std::mutex mutex;
int device_id;
CHECK_CUDA_ERROR(cudaGetDevice(&device_id), "Failed to get device ID...");
std::lock_guard<std::mutex> lock(mutex); // lock the mutex and generate a new cublas handle below
if (auto search = cublas_handles_map.find(device_id); search == cublas_handles_map.end())
{
cublas_handles_map[device_id] = std::move(CreateCublasHandle());
}
return *cublas_handles_map[device_id];
}
} // namespace cuda
} // namespace micm

0 comments on commit 1fa0bac

Please sign in to comment.