Skip to content

Commit

Permalink
Merge pull request #149 from DrTimothyAldenDavis/master
Browse files Browse the repository at this point in the history
Master
  • Loading branch information
DrTimothyAldenDavis authored Jul 9, 2022
2 parents 1a654bb + 385333b commit 5e7c3ed
Show file tree
Hide file tree
Showing 125 changed files with 2,479 additions and 1,326 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ endif ( )
set ( CMAKE_MACOSX_RPATH TRUE )

# version of SuiteSparse:GraphBLAS
set ( GraphBLAS_DATE "June 17, 2022" )
set ( GraphBLAS_DATE "July 8, 2022" )
set ( GraphBLAS_VERSION_MAJOR 7 )
set ( GraphBLAS_VERSION_MINOR 1 )
set ( GraphBLAS_VERSION_SUB 1 )
set ( GraphBLAS_VERSION_SUB 2 )

message ( STATUS "Building SuiteSparse:GraphBLAS version: v" ${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB} " date: " ${GraphBLAS_DATE} )

Expand Down
28 changes: 22 additions & 6 deletions CUDA/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,24 @@ set(GRAPHBLAS_CUDA_INCLUDES

message(STATUS "GraphBLAS CUDA includes: " "${GRAPHBLAS_CUDA_INCLUDES}")

target_include_directories(graphblascuda PUBLIC ${CUDAToolkit_INCLUDE_DIRS} ${GRAPHBLAS_CUDA_INCLUDES})
set(EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes)

IF(NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY})
file(MAKE_DIRECTORY ${EXTERNAL_INCLUDES_DIRECTORY})
endif()

IF(NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/cuco)
execute_process(
COMMAND git clone "https://github.com/NVIDIA/cuCollections.git" --branch main --recursive cuco
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes)
endif()

include_directories(${CMAKE_CURRENT_BINARY_DIR}/external_includes/cuco/include)

target_include_directories(graphblascuda PUBLIC
${CMAKE_CURRENT_BINARY_DIR}/external_includes/cuco/include
${CUDAToolkit_INCLUDE_DIRS}
${GRAPHBLAS_CUDA_INCLUDES})
set_target_properties(graphblascuda PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(graphblascuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(graphblascuda PROPERTIES CUDA_ARCHITECTURES "75")
Expand All @@ -46,7 +63,6 @@ install ( TARGETS graphblascuda
PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} )


# 1. Execute enumify/stringify/jitify logic to compile ptx kernels and compile/link w/ relevant *.cu files.

# TODO: Need to do this piece in cmake
Expand All @@ -58,10 +74,10 @@ set(CUDA_TEST_SUITES
)

#
set(CUDA_TEST_MONOIDS PLUS) #MIN MAX TIMES ANY)
set(CUDA_TEST_BINOPS TIMES) #PLUS MIN MAX DIV MINUS RDIV RMINUS FIRST SECOND PAIR)
set(CUDA_TEST_SEMIRINGS PLUS_TIMES) # MIN_PLUS MAX_PLUS)
set(CUDA_TEST_DATATYPES int32_t ) #int64_t uint32_t uint64_t float double)
set(CUDA_TEST_MONOIDS PLUS MIN MAX) # TIMES ANY)
set(CUDA_TEST_BINOPS TIMES PLUS MIN MAX DIV) #MINUS RDIV RMINUS FIRST SECOND PAIR)
set(CUDA_TEST_SEMIRINGS PLUS_TIMES MIN_PLUS MAX_PLUS)
set(CUDA_TEST_DATATYPES int32_t int64_t uint32_t uint64_t float double)
set(CUDA_TEST_KERNELS vsvs) # mp vsvs dndn spdn vssp)


Expand Down
110 changes: 80 additions & 30 deletions CUDA/GB_AxB_dot3_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ extern "C"

#include "jitFactory.hpp"
#include "GB_cuda_type_wrap.hpp"
#include "test/GpuTimer.h"

template<typename T, typename I>
void print_array(void *arr, I size, const char *name) {
Expand Down Expand Up @@ -68,6 +69,8 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
cudaStream_t stream;
CHECK_CUDA_SIMPLE(cudaStreamCreate(&stream));

GpuTimer kernel_timer;

//--------------------------------------------------------------------------
// check inputs
//--------------------------------------------------------------------------
Expand Down Expand Up @@ -149,6 +152,7 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
M_sparsity, false, M->hyper_switch, cnvec,
cnz+1, // add one to cnz for GB_cumsum of Cwork
true, C_iso, Context) ;

if (info != GrB_SUCCESS)
{
// out of memory
Expand All @@ -173,12 +177,39 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
cudaMemAdviseSetPreferredLocation, device));
}

//--------------------------------------------------------------------------
// Pre-fetch arrays that will be used on the device
//--------------------------------------------------------------------------

// prefetch M
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->p, (mnvec+1) * sizeof (int64_t),
device, stream)) ; //stream_data) ;
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->i, mnz * sizeof (int64_t),
device, stream )) ; //stream_data) ;
if (!(Mask_struct || M->iso))
{
// prefetch M->x only if the mask is valued and M is non-iso
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->x, mnz * M->type->size,
device, stream )) ; //stream_data) ;
}

// prefetch C
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->i, (cnz+1) * sizeof (int64_t),
device, stream )); //stream_data) ;
if (!C_iso)
{
// FIXME: why prefect C->x?
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->x, (cnz+1) * C->type->size,
device, stream )); //stream_data) ;
}

//--------------------------------------------------------------------------
// copy Mp and Mh into C
//--------------------------------------------------------------------------

CHECK_CUDA_SIMPLE( cudaMemcpyAsync (C->p, M->p, (cnvec+1) * sizeof (int64_t),
cudaMemcpyDefault, stream)) ;
//memcpy( C->p, M->p, (cnvec+1)* sizeof( int64_t) );
if (M_is_hyper)
{
// FIXME: this method does not yet handle the hypersparse case
Expand Down Expand Up @@ -207,7 +238,7 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
jit::GBJitCache filecache = jit::GBJitCache::Instance() ;
filecache.getFile (my_mxm_spec) ;

GBURBLE ("(GPU stringified) ") ;
GBURBLE ("(GPU stringified srcode = %lu)\n", my_mxm_spec.sr_code) ;

//--------------------------------------------------------------------------
// construct the tasks for phase1 and phase2
Expand All @@ -234,8 +265,8 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
Blockbucket = (int64_t*)
rmm_wrap_malloc(blockbuckets_size * sizeof (int64_t));
Bucketp = (int64_t*)rmm_wrap_malloc((NBUCKETS+1) * sizeof (int64_t));
Bucket = (int64_t*)rmm_wrap_malloc(mnz * sizeof (int64_t));
offset = (int64_t*)rmm_wrap_malloc(NBUCKETS * sizeof (int64_t));
Bucket = (int64_t*)rmm_wrap_malloc(mnz * sizeof (int64_t));
if (Nanobuckets == NULL || Blockbucket == NULL || Bucketp == NULL
|| Bucket == NULL || offset == NULL)
{
Expand All @@ -246,16 +277,16 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method

// fixme: do async with streams
// FIXME: do we need any of these?
CHECK_CUDA_SIMPLE(cudaMemsetAsync(Nanobuckets, 0,
nanobuckets_size * sizeof(int64_t), stream));
CHECK_CUDA_SIMPLE(cudaMemsetAsync(Blockbucket, 0,
blockbuckets_size * sizeof(int64_t), stream));
// CHECK_CUDA_SIMPLE(cudaMemsetAsync(Nanobuckets, 0,
// nanobuckets_size * sizeof(int64_t), stream));
// CHECK_CUDA_SIMPLE(cudaMemsetAsync(Blockbucket, 0,
// blockbuckets_size * sizeof(int64_t), stream));
CHECK_CUDA_SIMPLE(cudaMemsetAsync(Bucketp, 0,
(NBUCKETS+1) * sizeof(int64_t), stream));
CHECK_CUDA_SIMPLE(cudaMemsetAsync(Bucket, 0,
mnz * sizeof(int64_t), stream));
CHECK_CUDA_SIMPLE(cudaMemsetAsync(offset, 0,
NBUCKETS * sizeof(int64_t), stream));
//CHECK_CUDA_SIMPLE(cudaMemsetAsync(Bucket, 0,
// mnz * sizeof(int64_t), stream));

//--------------------------------------------------------------------------
// phase1 and phase2: place each C(i,j) in a bucket
Expand Down Expand Up @@ -287,15 +318,19 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
device, stream )) ; //stream_data) ;
}

// prefetch C
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->i, (cnz+1) * sizeof (int64_t),
device, stream )); //stream_data) ;
if (!C_iso)
{
// FIXME: why prefect C->x?
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->x, (cnz+1) * C->type->size,
device, stream )); //stream_data) ;
}
// // prefetch C
// CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->i, (cnz+1) * sizeof (int64_t),
// device, stream )); //stream_data) ;
// if (!C_iso)
// {
// // FIXME: why prefect C->x?
// CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->x, (cnz+1) * C->type->size,
// device, stream )); //stream_data) ;
// }

//--------------------------------------------------------------------------
// Pre-fetch arrays that will be used on the device
//--------------------------------------------------------------------------

// prefetch A
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->p, (anvec+1) * sizeof (int64_t),
Expand Down Expand Up @@ -327,56 +362,71 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
//--------------------------------------------------------------------------

GBURBLE ("(GPU phase1 start) ") ;

kernel_timer.Start();
p1lf.jitGridBlockLaunch(Nanobuckets, Blockbucket, C, M, A, B, stream);
CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream));
kernel_timer.Stop();

GBURBLE ("(GPU phase1 done) ") ;
GBURBLE ("(GPU phase1 done %12.6g ms )\n", kernel_timer.Elapsed()) ;

//--------------------------------------------------------------------------
// phase2: cumsum across the blockbuckets, propagate to thread level
//--------------------------------------------------------------------------

GBURBLE ("(GPU phase1 start) ") ;
GBURBLE ("(GPU phase2 start nblk=%d ) ", ntasks) ;

kernel_timer.Start();
p2lf.jitGridBlockLaunch(Blockbucket, offset, M, stream);
kernel_timer.Stop();

CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream));

int64_t s= offset[0];
C->nzombies = s;
bool all_in_one = false;
for ( int bucket = 1 ; bucket < NBUCKETS+1; ++bucket)
{
Bucketp[bucket] = s;
s+= offset[bucket];
if ( (Bucketp[bucket] - Bucketp[bucket-1] ) == mnz ) all_in_one = true;
}

GBURBLE ("(GPU phase2 done) ") ;
GBURBLE ("(GPU phase2 done %12.6g ms )\n", kernel_timer.Elapsed()) ;

GBURBLE ("(GPU phase2end start) ") ;
if( !all_in_one)
{
GBURBLE ("(GPU phase2end start nblk=%d) ", ntasks) ;

p2elf.jitGridBlockLaunch(Nanobuckets, Blockbucket,
Bucketp, Bucket, offset, C, M, stream);
kernel_timer.Start();
p2elf.jitGridBlockLaunch(Nanobuckets, Blockbucket,
Bucketp, Bucket, offset, C, M, stream);

GBURBLE ("(GPU phase2end done) ") ;
CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream));
kernel_timer.Stop();
GBURBLE ("(GPU phase2end done %12.6g ms)\n",kernel_timer.Elapsed()) ;
}

//--------------------------------------------------------------------------
// phase3: do the numerical work
//--------------------------------------------------------------------------

CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream));

for ( int bucket = 1 ; bucket < NBUCKETS; ++bucket)
{
int64_t start = Bucketp[bucket];
int64_t end = Bucketp[bucket + 1 ];
//int64_t start = 0;
//int64_t end = cnz;

if(end - start > 0) {
// TODO: Use stream pool
phase3launchFactory p3lf(my_mxm_spec, (GB_bucket_code)bucket);
p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket,
C, M, A, B, stream);
GBURBLE ("(GPU phase3 bucket %d done ) ", bucket) ;
}
GBURBLE ("(GPU phase3 bucket %d launch ) ", bucket) ;
kernel_timer.Start();
p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket, C, M, A, B, stream);
CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream)); // only for timing
kernel_timer.Stop();
GBURBLE ("(GPU phase3 bucket %d done %12.6g ms)\n", bucket, kernel_timer.Elapsed()) ; }
}

GB_FREE_WORKSPACE ;
Expand Down
2 changes: 2 additions & 0 deletions CUDA/GB_cuda_atomics.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ template <typename T>
__device__ void atomic_sub(T* ptr, T val);

template<> __device__ __inline__ void atomic_add<int>(int* ptr, int val) { atomicAdd(ptr, val); }
template<> __device__ __inline__ void atomic_add<uint32_t>(uint32_t* ptr, uint32_t val) { atomicAdd((unsigned int*)ptr, (unsigned int)val); }
template<> __device__ __inline__ void atomic_add<int64_t>(int64_t* ptr, int64_t val) { atomicAdd((unsigned long long*)ptr, (unsigned long long)val); }
template<> __device__ __inline__ void atomic_add<uint64_t>(uint64_t* ptr, uint64_t val) { atomicAdd((unsigned long long*)ptr, (unsigned long long)val); }
template<> __device__ __inline__ void atomic_add<float>(float* ptr, float val) { atomicAdd(ptr, val); }
template<> __device__ __inline__ void atomic_add<double>(double* ptr, double val) { atomicAdd(ptr, val); }

Expand Down
57 changes: 31 additions & 26 deletions CUDA/GB_cuda_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,11 @@
#define GB_C_MULT( c, a, b)
#define GB_MULTADD( c, a ,b )
#define GB_DOT_TERMINAL ( c )
#define GB_DOT_MERGE \
#define GB_DOT_MERGE(pA,pB) \
{ \
cij_exists = true ; \
}
#define GB_CIJ_EXIST_POSTCHECK

#else

Expand All @@ -105,33 +106,37 @@
#define GB_MULTADD( c, a ,b ) GB_ADD_F( (c), GB_MULT( (a),(b) ) )
#define GB_DOT_TERMINAL ( c )
//# if ( c == TERMINAL_VALUE) break;
// cij += A(k,i) * B(k,j), for merge operation

#define GB_DOT_MERGE \
{ \
GB_GETA (aki, Ax, pA) ; /* aki = A(k,i) */ \
GB_GETB (bkj, Bx, pB) ; /* bkj = B(k,j) */ \
cij_exists = true ; \
GB_MULTADD (cij, aki, bkj) ; /* cij += aki * bkj */ \
}
#if GB_IS_PLUS_PAIR_REAL_SEMIRING

#if 0
#define GB_DOT_MERGE \
{ \
GB_GETA ( aki, Ax, pA) ; /* aki = A(k,i) */ \
GB_GETB ( bkj, Bx, pB) ; /* bkj = B(k,j) */ \
if (cij_exists) \
{ \
GB_MULTADD (cij, aki, bkj) ; /* cij += aki * bkj */ \
} \
else \
{ \
/* cij = A(k,i) * B(k,j), and add to the pattern */ \
cij_exists = true ; \
GB_C_MULT (cij, aki, bkj) ; /* cij = aki * bkj */ \
} \
}
#endif
// cij += A(k,i) * B(k,j), for merge operation (plus_pair_real semiring)
#if GB_ZTYPE_IGNORE_OVERFLOW
// plus_pair for int64, uint64, float, or double
#define GB_DOT_MERGE(pA,pB) cij++ ;
#define GB_CIJ_EXIST_POSTCHECK cij_exists = (cij != 0) ;
#else
// plus_pair semiring for small integers
#define GB_DOT_MERGE(pA,pB) \
{ \
cij_exists = true ; \
cij++ ; \
}
#define GB_CIJ_EXIST_POSTCHECK
#endif

#else

// cij += A(k,i) * B(k,j), for merge operation (general case)
#define GB_DOT_MERGE(pA,pB) \
{ \
GB_GETA (aki, Ax, pA) ; /* aki = A(k,i) */ \
GB_GETB (bkj, Bx, pB) ; /* bkj = B(k,j) */ \
cij_exists = true ; \
GB_MULTADD (cij, aki, bkj) ; /* cij += aki * bkj */ \
}
#define GB_CIJ_EXIST_POSTCHECK

#endif

#endif

Expand Down
Loading

0 comments on commit 5e7c3ed

Please sign in to comment.