Skip to content

Commit

Permalink
Merge pull request #144 from DrTimothyAldenDavis/master
Browse files Browse the repository at this point in the history
Master
  • Loading branch information
DrTimothyAldenDavis authored May 20, 2022
2 parents 8881e16 + 0528f93 commit 1bf1dde
Show file tree
Hide file tree
Showing 116 changed files with 9,891 additions and 40,793 deletions.
6 changes: 3 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,10 @@ endif ( )
set ( CMAKE_MACOSX_RPATH TRUE )

# version of SuiteSparse:GraphBLAS
set ( GraphBLAS_DATE "Apr 25, 2022" )
set ( GraphBLAS_DATE "May 20, 2022" )
set ( GraphBLAS_VERSION_MAJOR 7 )
set ( GraphBLAS_VERSION_MINOR 0 )
set ( GraphBLAS_VERSION_SUB 4 )
set ( GraphBLAS_VERSION_MINOR 1 )
set ( GraphBLAS_VERSION_SUB 0 )

message ( STATUS "Building SuiteSparse:GraphBLAS version: v" ${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB} " date: " ${GraphBLAS_DATE} )

Expand Down
1 change: 1 addition & 0 deletions CUDA/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
*.so
jitFactory
stringify
rmm_log.txt

# Do not ignore this file
!.gitignore
Expand Down
39 changes: 23 additions & 16 deletions CUDA/GB_AxB_dot3_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,12 +139,17 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
int64_t cnz = mnz ;
int64_t cnvec = mnvec ;

int sparsity_M = (M_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE ;
int M_sparsity = (M_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE ;
int C_sparsity = M_sparsity ;
bool C_iso = false ;
info = GB_new_bix (&C, // sparse or hyper (from M), existing header
ctype, cvlen, cvdim, GB_Ap_malloc, true,
sparsity_M, false, M->hyper_switch, cnvec,
M_sparsity, false, M->hyper_switch, cnvec,
cnz+1, // add one to cnz for GB_cumsum of Cwork
true, /* not iso: */ false, Context) ;
true, C_iso, Context) ;

CHECK_CUDA_SIMPLE(cudaMemset(C->i, 0, (cnz+1) * sizeof(int64_t)));
CHECK_CUDA_SIMPLE(cudaMemset(C->x, 0, (cnz+1) * sizeof(ctype->size)));

if (info != GrB_SUCCESS)
{
Expand Down Expand Up @@ -174,7 +179,7 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method

C->magic = GB_MAGIC ;
C->nvec_nonempty = M->nvec_nonempty ;
C->nvec = M->nvec ;
// C->nvec = M->nvec ;
// the dot3 CUDA kernel will produce C->i with jumbled indices
C->jumbled = true ;

Expand All @@ -183,16 +188,15 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
// stringify the semiring and the mask
//--------------------------------------------------------------------------

GB_cuda_semiring_factory mysemiring = GB_cuda_semiring_factory ( ) ;
GB_cuda_mxm_factory my_mxm_spec = GB_cuda_mxm_factory ( ) ;

// (1) create the semiring code and name
mysemiring.semiring_factory ( semiring, flipxy,
ctype, M->type, A->type, B->type, Mask_struct, // matrix types
false, GB_sparsity(C), GB_sparsity(M), GB_sparsity(A), GB_sparsity(B) ) ;
// (1) create the mxm code and name
my_mxm_spec.mxm_factory ( C_iso, C_sparsity, ctype,
M, Mask_struct, false, semiring, flipxy, A, B) ;

// (2) ensure the jitifier has "GB_semiring_[mysemiring.sr_code].h"
// (2) ensure the jitifier has "GB_mxm_[my_mxm_spec.sr_code].h"
jit::GBJitCache filecache = jit::GBJitCache::Instance() ;
filecache.getFile (mysemiring) ;
filecache.getFile (my_mxm_spec) ;

GBURBLE ("(GPU stringified) ") ;
//--------------------------------------------------------------------------
Expand All @@ -201,7 +205,7 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method

// on the CPU: nthreads = GB_nthreads (cnz, chunk, nthreads_max) ;
// on the GPU:
phase1launchFactory p1lf(mysemiring);
phase1launchFactory p1lf(my_mxm_spec);
phase2launchFactory p2lf;
phase2endlaunchFactory p2elf;

Expand Down Expand Up @@ -233,26 +237,28 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
CHECK_CUDA_SIMPLE(cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
CHECK_CUDA_SIMPLE(cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t), cudaMemAdviseSetAccessedBy, device));

offset = (int64_t*)rmm_wrap_malloc( (NBUCKETS)*sizeof(int64_t)) ;
CHECK_CUDA_SIMPLE(cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
CHECK_CUDA_SIMPLE(cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t), cudaMemAdviseSetAccessedBy, device));

memset( offset, 0, NBUCKETS * sizeof(int64_t) );

//--------------------------------------------------------------------------
// Pre-fetch arrays that will be used on the device
//--------------------------------------------------------------------------

CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->p, (mnvec+1) * sizeof (int64_t), device, NULL)) ; //stream_data) ;
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->i, mnz * sizeof (int64_t), device, NULL )) ; //stream_data) ;
// FIXME: if Mask_struct is true, skip this:
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->x, mnz * M->type->size, device, NULL )) ; //stream_data) ;

CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->i, (cnz+1) * sizeof (int64_t), device, NULL )); //stream_data) ;
// FIXME: skip if C iso:
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->x, (cnz+1) * C->type->size, device, NULL )); //stream_data) ;
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->p, (anvec+1) * sizeof (int64_t), device, NULL)); // stream_data) ;
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->i, anz * sizeof (int64_t), device, NULL )) ; //stream_data) ;
// FIXME: skip if A iso:
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->x, anz * A->type->size, device, NULL )) ; //stream_data) ;
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( B->p, (bnvec+1) * sizeof (int64_t), device, NULL)); //stream_data) ;
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( B->i, bnz * sizeof (int64_t), device, NULL )); //stream_data) ;
// FIXME: skip if B iso:
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( B->x, bnz * B->type->size, device, NULL )); //stream_data) ;

// The work to compute C(i,j) is held in Ci [p], if C(i,j) appears in
Expand Down Expand Up @@ -281,6 +287,7 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
p2lf.jitGridBlockLaunch(Blockbucket, offset, M );

int64_t s= offset[0];
C->nzombies = s;
for ( int bucket = 1 ; bucket < NBUCKETS+1; ++bucket)
{
Bucketp[bucket] = s;
Expand Down Expand Up @@ -316,7 +323,7 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
if(end - start > 0) {
printf("Executing bucket: %d with %ld edges\n", bucket, end-start);
// TODO: We might want to consider submitting these in different cuda streams (maybe use cuda stream pool?)
phase3launchFactory p3lf(mysemiring, (GB_bucket_code)bucket);
phase3launchFactory p3lf(my_mxm_spec, (GB_bucket_code)bucket);
p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket, C, M, A, B);
} else {
printf("Skipping bucket %d, no work to do\n", bucket);
Expand Down
1 change: 0 additions & 1 deletion CUDA/GB_AxB_dot3_cuda_branch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ bool GB_AxB_dot3_cuda_branch
GBURBLE (" work:%g GPUs:%d ", work, ngpus_to_use) ;
if (ngpus_to_use > 0
// FIXME: FUTURE: user-defined types and operators
// && (semiring->header_size == 0) // semiring is built-in
&& (A->type->code != GB_UDT_code)
&& (B->type->code != GB_UDT_code)
// FIXME: M could be hypersparse. we should handle this
Expand Down
4 changes: 2 additions & 2 deletions CUDA/GB_Matrix_allocate.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@

#ifndef GB_MATRIX_ALLOCATE_H
#define GB_MATRIX_ALLOCATE_H
#include "matrix.h"
#include "pmr_malloc.h"
#include "GB_cuda_kernel.h"
#include "rmm_wrap.h"

#ifdef __cplusplus
extern "C" {
Expand Down
2 changes: 1 addition & 1 deletion CUDA/GB_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ extern "C"
#include "cuda_runtime.h"
#include "cuda.h"
#include "jitify.hpp"
#include "GB_cuda_semiring_factory.hpp"
#include "GB_cuda_mxm_factory.hpp"

#include <cassert>
#include <cmath>
Expand Down
237 changes: 237 additions & 0 deletions CUDA/GB_cuda_kernel.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
//------------------------------------------------------------------------------
// CUDA/GB_cuda_kernel.h: definitions for all GraphBLAS CUDA kernels
//------------------------------------------------------------------------------

// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// This file is #include'd into all CUDA kernels for GraphBLAS. It provides
// a

#pragma once
#undef ASSERT
#define ASSERT(x)

//------------------------------------------------------------------------------
// TODO: this will be in the jit code:
#define chunksize 128

//------------------------------------------------------------------------------
// GETA, GETB: get entries from input matrices A and B
//------------------------------------------------------------------------------

#if GB_FLIPXY

#if GB_A_IS_PATTERN
#define GB_DECLAREA(aval)
#define GB_SHAREDA(aval)
#define GB_GETA( aval, ax, p)
#else
#define GB_DECLAREA(aval) T_Y aval
#define GB_SHAREDA(aval) __shared__ T_Y aval
#if GB_A_ISO
#define GB_GETA( aval, ax, p) aval = (T_Y) (ax [0]) ;
#else
#define GB_GETA( aval, ax, p) aval = (T_Y) (ax [p]) ;
#endif
#endif

#if GB_B_IS_PATTERN
#define GB_DECLAREB(bval)
#define GB_SHAREDB(bval)
#define GB_GETB( bval, bx, p)
#else
#define GB_DECLAREB(bval) T_X bval
#define GB_SHAREDB(bval) __shared__ T_X bval
#if GB_B_ISO
#define GB_GETB( bval, bx, p) bval = (T_X) (bx [0]) ;
#else
#define GB_GETB( bval, bx, p) bval = (T_X) (bx [p]) ;
#endif
#endif

#else

#if GB_A_IS_PATTERN
#define GB_DECLAREA(aval)
#define GB_SHAREDA(aval)
#define GB_GETA( aval, ax, p)
#else
#define GB_DECLAREA(aval) T_X aval
#define GB_SHAREDA(aval) __shared__ T_X aval
#if GB_A_ISO
#define GB_GETA( aval, ax, p) aval = (T_X) (ax [0]) ;
#else
#define GB_GETA( aval, ax, p) aval = (T_X) (ax [p]) ;
#endif
#endif

#if GB_B_IS_PATTERN
#define GB_DECLAREB(bval)
#define GB_SHAREDB(bval)
#define GB_GETB( bval, bx, p)
#else
#define GB_DECLAREB(bval) T_Y bval
#define GB_SHAREDB(bval) __shared__ T_Y bval
#if GB_B_ISO
#define GB_GETB( bval, bx, p) bval = (T_Y) (bx [0]) ;
#else
#define GB_GETB( bval, bx, p) bval = (T_Y) (bx [p]) ;
#endif
#endif

#endif

//------------------------------------------------------------------------------
// operators
//------------------------------------------------------------------------------

#if GB_C_ISO

#define GB_ADD_F( f , s)
#define GB_C_MULT( c, a, b)
#define GB_MULTADD( c, a ,b )
#define GB_DOT_TERMINAL ( c )
#define GB_DOT_MERGE \
{ \
cij_exists = true ; \
}

#else

#define GB_ADD_F( f , s) f = GB_ADD ( f, s )
#define GB_C_MULT( c, a, b) c = GB_MULT( (a), (b) )
#define GB_MULTADD( c, a ,b ) GB_ADD_F( (c), GB_MULT( (a),(b) ) )
#define GB_DOT_TERMINAL ( c )
//# if ( c == TERMINAL_VALUE) break;
// cij += A(k,i) * B(k,j), for merge operation

#define GB_DOT_MERGE \
{ \
GB_GETA ( aki, Ax, pA) ; /* aki = A(k,i) */ \
GB_GETB ( bkj, Bx, pB) ; /* bkj = B(k,j) */ \
if (cij_exists) \
{ \
GB_MULTADD (cij, aki, bkj) ; /* cij += aki * bkj */ \
} \
else \
{ \
/* cij = A(k,i) * B(k,j), and add to the pattern */ \
cij_exists = true ; \
GB_C_MULT (cij, aki, bkj) ; /* cij = aki * bkj */ \
} \
}

#endif

//------------------------------------------------------------------------------
// subset of GraphBLAS.h
//------------------------------------------------------------------------------

#ifndef GRAPHBLAS_H
#define GRAPHBLAS_H

#undef restrict
#undef GB_restrict
#if defined ( GB_CUDA_KERNEL ) || defined ( __NVCC__ )
#define GB_restrict __restrict__
#else
#define GB_restrict
#endif
#define restrict GB_restrict

#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

// GB_STR: convert the content of x into a string "x"
#define GB_XSTR(x) GB_STR(x)
#define GB_STR(x) #x

#undef GB_PUBLIC
#define GB_PUBLIC extern
#undef GxB_MAX_NAME_LEN
#define GxB_MAX_NAME_LEN 128

typedef uint64_t GrB_Index ;
typedef struct GB_Descriptor_opaque *GrB_Descriptor ;
typedef struct GB_Type_opaque *GrB_Type ;
typedef struct GB_UnaryOp_opaque *GrB_UnaryOp ;
typedef struct GB_BinaryOp_opaque *GrB_BinaryOp ;
typedef struct GB_SelectOp_opaque *GxB_SelectOp ;
typedef struct GB_IndexUnaryOp_opaque *GrB_IndexUnaryOp ;
typedef struct GB_Monoid_opaque *GrB_Monoid ;
typedef struct GB_Semiring_opaque *GrB_Semiring ;
typedef struct GB_Scalar_opaque *GrB_Scalar ;
typedef struct GB_Vector_opaque *GrB_Vector ;
typedef struct GB_Matrix_opaque *GrB_Matrix ;

#define GxB_HYPERSPARSE 1 // store matrix in hypersparse form
#define GxB_SPARSE 2 // store matrix as sparse form (compressed vector)
#define GxB_BITMAP 4 // store matrix as a bitmap
#define GxB_FULL 8 // store matrix as full; all entries must be present

typedef void (*GxB_unary_function) (void *, const void *) ;
typedef void (*GxB_binary_function) (void *, const void *, const void *) ;

typedef bool (*GxB_select_function) // return true if A(i,j) is kept
(
GrB_Index i, // row index of A(i,j)
GrB_Index j, // column index of A(i,j)
const void *x, // value of A(i,j)
const void *thunk // optional input for select function
) ;

typedef void (*GxB_index_unary_function)
(
void *z, // output value z, of type ztype
const void *x, // input value x of type xtype; value of v(i) or A(i,j)
GrB_Index i, // row index of A(i,j)
GrB_Index j, // column index of A(i,j), or zero for v(i)
const void *y // input scalar y
) ;

typedef enum
{
// for all GrB_Descriptor fields:
GxB_DEFAULT = 0, // default behavior of the method

// for GrB_OUTP only:
GrB_REPLACE = 1, // clear the output before assigning new values to it

// for GrB_MASK only:
GrB_COMP = 2, // use the structural complement of the input
GrB_SCMP = 2, // same as GrB_COMP (historical; use GrB_COMP instead)
GrB_STRUCTURE = 4, // use the only pattern of the mask, not its values

// for GrB_INP0 and GrB_INP1 only:
GrB_TRAN = 3, // use the transpose of the input

// for GxB_GPU_CONTROL only (DRAFT: in progress, do not use)
GxB_GPU_ALWAYS = 2001,
GxB_GPU_NEVER = 2002,

// for GxB_AxB_METHOD only:
GxB_AxB_GUSTAVSON = 1001, // gather-scatter saxpy method
GxB_AxB_DOT = 1003, // dot product
GxB_AxB_HASH = 1004, // hash-based saxpy method
GxB_AxB_SAXPY = 1005 // saxpy method (any kind)
}
GrB_Desc_Value ;

#include "GB_opaque.h"
#endif

//------------------------------------------------------------------------------
// subset of GB.h
//------------------------------------------------------------------------------

#include "GB_imin.h"
#include "GB_zombie.h"
#include "GB_nnz.h"
#include "GB_partition.h"
#include "GB_binary_search.h"
#include "GB_search_for_vector_template.c"

Loading

0 comments on commit 1bf1dde

Please sign in to comment.