Commit 8881e16
Merge pull request #140 from DrTimothyAldenDavis/master
Master
DrTimothyAldenDavis authored Apr 26, 2022
2 parents 4bb44bb + 3f7827f commit 8881e16
Showing 31 changed files with 2,892 additions and 169 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -26,10 +26,10 @@ endif ( )
set ( CMAKE_MACOSX_RPATH TRUE )

# version of SuiteSparse:GraphBLAS
set ( GraphBLAS_DATE "Apr 8, 2022" )
set ( GraphBLAS_DATE "Apr 25, 2022" )
set ( GraphBLAS_VERSION_MAJOR 7 )
set ( GraphBLAS_VERSION_MINOR 0 )
set ( GraphBLAS_VERSION_SUB 3 )
set ( GraphBLAS_VERSION_SUB 4 )

message ( STATUS "Building SuiteSparse:GraphBLAS version: v" ${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB} " date: " ${GraphBLAS_DATE} )

35 changes: 17 additions & 18 deletions CUDA/GB_AxB_dot3_cuda.cpp
@@ -206,7 +206,8 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
phase2endlaunchFactory p2elf;


// # of threads in phase1 and phase2 kernel launches must be the same
// # of threads in phase1 and phase2 kernel launches are related
// # by the size of the warp. ph2_task = ph1_task/32 for example
int nthrd = p2lf.get_threads_per_block();
int ntasks = p2elf.get_number_of_blocks(M);
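A minimal standalone sketch of the relationship described in the comment above, using hypothetical task counts (the warp size of 32 is the only fixed quantity): phase2 reduces the per-block counts produced by phase1, so its task count is roughly the phase1 task count divided by 32, rounded up.

    // illustration only -- hypothetical counts, not values computed by this code
    const int warp_size = 32 ;
    int ph1_tasks = 1000 ;                                    // blocks launched for phase1
    int ph2_tasks = (ph1_tasks + warp_size - 1) / warp_size ; // = 32 blocks for phase2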

@@ -267,21 +268,22 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method

GBURBLE ("(GPU phase1 done) ") ;

print_array<int64_t>(Nanobuckets, nanobuckets_size, "Nanobuckets");
print_array<int64_t>(Blockbucket, blockbuckets_size , "Blockbucket");
//print_array<int64_t>(Nanobuckets, nanobuckets_size, "Nanobuckets");
printf(" using %ld blockbuckets \n", blockbuckets_size);
//print_array<int64_t>(Blockbucket, blockbuckets_size , "Blockbucket");

//----------------------------------------------------------------------
// phase2: cumsum across the blockbuckets, propagate to thread level
//----------------------------------------------------------------------

GBURBLE ("(GPU phase1 start) ") ;

p2lf.jitGridBlockLaunch(Blockbucket, offset, M);
p2lf.jitGridBlockLaunch(Blockbucket, offset, M );

int64_t s= 0;
for ( int bucket = 0 ; bucket < NBUCKETS+1; ++bucket)
int64_t s= offset[0];
for ( int bucket = 1 ; bucket < NBUCKETS+1; ++bucket)
{
Bucketp[bucket] = s;
Bucketp[bucket] = s;
s+= offset[bucket];
printf("bucketp[%d] = %ld, offset=%ld\n", bucket, Bucketp[bucket], offset[bucket]);
}
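For reference, a standalone sketch of the cumulative-sum pattern the loop above implements (hypothetical counts; NBUCKETS is given a made-up value here): Bucketp[b] ends up holding the sum of offset[0..b-1], so Bucketp[b+1] - Bucketp[b] is the number of dot products that fell into bucket b, and Bucketp[1] is the size of bucket 0, the pre-zombie entries.

    #include <cstdio>
    #include <cstdint>

    int main (void)
    {
        const int NBUCKETS = 12 ;                       // hypothetical value for this sketch
        int64_t offset  [NBUCKETS+1] = {5, 3, 0, 7} ;   // hypothetical per-bucket counts
        int64_t Bucketp [NBUCKETS+1] = {0} ;

        int64_t s = offset [0] ;                        // bucket 0: the pre-zombie count
        for (int bucket = 1 ; bucket < NBUCKETS+1 ; ++bucket)
        {
            Bucketp [bucket] = s ;                      // exclusive prefix sum of offset [ ]
            s += offset [bucket] ;
        }
        // Bucketp = {0, 5, 8, 8, 15, 15, ...}
        printf ("bucket 3 holds %ld entries\n", (long) (Bucketp [4] - Bucketp [3])) ;   // 7
        return (0) ;
    }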
@@ -295,38 +297,35 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method

GBURBLE ("(GPU phase2end done) ") ;

print_array<int64_t>(Bucket, mnz , "Bucket");
print_array<int64_t>(M->i, mnz , "M->i");
print_array<int64_t>(C->i, mnz , "C->i");
//print_array<int64_t>(Bucket, mnz , "Bucket");
//print_array<int64_t>(M->i, mnz , "M->i");
//print_array<int64_t>(C->i, mnz , "C->i");

//----------------------------------------------------------------------
// phase3: do the numerical work
//----------------------------------------------------------------------

print_array<int64_t>(Bucketp, NBUCKETS + 1 , "Bucketp");
C->nzombies = Bucketp[1]; //set pre-zombie counts
printf("pre-kernel C->nzombies=%ld\n", C->nzombies);
printf("pre-phase3 kernel C->nzombies=%ld\n", C->nzombies);

for ( int bucket = 1 ; bucket < NBUCKETS; ++bucket)
{
int64_t start = Bucketp[bucket];
int64_t end = Bucketp[bucket+1];

int64_t end = Bucketp[bucket + 1 ];

if(end - start > 0) {
printf("Executing bucket: %d with %ld edges\n", bucket, end-start);
// TODO: We might want to consider submitting these in different cuda streams (maybe use cuda stream pool?)
phase3launchFactory p3lf(mysemiring, (GB_bucket_code)bucket);
p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket, C, M, A, B);
p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket, C, M, A, B);
} else {
printf("Skipping bucket %d, no work to do\n", bucket);
}

GBURBLE ("(GPU phase3 done ) ") ;
}
C->nzombies += Bucketp[1];
printf("C->p[0]=%ld\n", C->p[0]);
printf("C->p[1]=%ld\n", C->p[1]);
//printf("C->p[0]=%ld\n", C->p[0]);
//printf("C->p[1]=%ld\n", C->p[1]);
printf("C->nzombies=%ld\n", C->nzombies);

GB_FREE_WORKSPACE ;
8 changes: 7 additions & 1 deletion CUDA/GB_cuda_semiring_factory.hpp
@@ -22,12 +22,15 @@ extern "C"
//std::istream* (*file_callback)(std::string, std::iostream&);

// Define a factory class for building any semiring text definitions

// FIXME: Rename to GrB_MxM_problem_spec and delegate problem generation to data factory
class GB_cuda_semiring_factory: public jit::File_Desc {

public:

uint32_t mask_ecode;
uint64_t sr_code;
bool mask_struct;
bool mask_comp;

// file ptr
FILE *fp;
@@ -55,6 +58,8 @@ class GB_cuda_semiring_factory: public jit::File_Desc {
// input:
GrB_Semiring semiring, // the semiring to enumify
bool flipxy, // multiplier is: mult(a,b) or mult(b,a)

// FIXME: Just use GrB_Matrix here
GrB_Type ctype, // the type of C
GrB_Type mtype, // the type of M, or NULL if no mask
GrB_Type atype, // the type of A
@@ -109,6 +114,7 @@ class GB_cuda_semiring_factory: public jit::File_Desc {
// operators, datatypes, sparsity formats and produces a character buffer.
//------------------------------------------------------------------------------

// FIXME: Also need mask code macrofication
void macrofy ( ) override
{
std::cout<<" calling macrofy semiring. sr_code="<< this->sr_code << std::endl;
2 changes: 0 additions & 2 deletions CUDA/GB_reduce_to_scalar_cuda.cpp
@@ -37,8 +37,6 @@ GrB_Info GB_reduce_to_scalar_cuda

GB_cuda_reduce( A, s, reduce);

printf("num_triangles = %d\n", s[0] );

return GrB_SUCCESS ;
}

39 changes: 27 additions & 12 deletions CUDA/jitFactory.hpp
@@ -142,7 +142,7 @@ class phase1launchFactory
std::cout << "B TYpe: " << B->type << std::endl;
// // (1) create the semiring code and name

// // (2) ensure the jitifier has "GB_semiring_[mysemiring.sr_code].h"
// // (2) ensure the jitifier has "GB_semiring_[mysemiring.sr_code].h"
jit::GBJitCache filecache = jit::GBJitCache::Instance() ;
filecache.getFile (semiring_factory_) ;

@@ -162,6 +162,11 @@ class phase1launchFactory
dim3 grid(get_number_of_blocks(M));
dim3 block(get_threads_per_block());

// for (auto s:compiler_flags)
// {
// std::cout << "Compiler Flags: " << s << std::endl ;
// }

jit::launcher( hashable_name + "_" + M->type->name + "_" + sr_code,
string_to_be_jitted.str(),
header_names,
@@ -199,6 +204,13 @@ class phase2launchFactory
return (ntasks + threads_per_block - 1) / threads_per_block ;
}

int get_number_of_phase1_blocks( GrB_Matrix M){
const int64_t mnz = GB_nnz (M) ;
int number_of_sms = GB_Global_gpu_sm_get (0);
int nblks = ( GB_nnz (M) + chunk_size - 1)/chunk_size;
return GB_IMIN( nblks, 128 * number_of_sms);
}

bool jitGridBlockLaunch(// parameters to AxB_phase2:
int64_t *blockBucket, int64_t *offset, GrB_Matrix M) {

@@ -224,7 +236,7 @@ class phase2launchFactory
.set_kernel_inst( kernel_name, {})
.configure(grid, block)
// parameters to AxB_phase2:
.launch( blockBucket, offset, get_number_of_blocks(M));
.launch( blockBucket, offset, get_number_of_phase1_blocks(M));

checkCudaErrors( cudaDeviceSynchronize() );
result= true;
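A standalone illustration (hypothetical sizes and an assumed bucket-major layout of blockBucket, which this diff does not define) of why the launch above now passes get_number_of_phase1_blocks(M): phase2 reduces one counter per bucket per phase1 block, so it must be told how many blocks phase1 actually used, not how many blocks phase2 itself launches.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main (void)
    {
        const int NBUCKETS = 12 ;            // hypothetical
        int nblocks_phase1 = 4 ;             // what get_number_of_phase1_blocks(M) would report
        // one counter per (bucket, phase1 block); all set to 1 for the illustration
        std::vector<int64_t> blockBucket (NBUCKETS * nblocks_phase1, 1) ;
        std::vector<int64_t> offset (NBUCKETS + 1, 0) ;

        for (int b = 0 ; b < NBUCKETS ; b++)
        {
            for (int blk = 0 ; blk < nblocks_phase1 ; blk++)
            {
                // sum across *phase1* blocks; using phase2's own block count here
                // would read the wrong number of counters
                offset [b] += blockBucket [(size_t) b * nblocks_phase1 + blk] ;
            }
        }
        printf ("bucket 0 total = %ld\n", (long) offset [0]) ;   // 4
        return (0) ;
    }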
@@ -319,9 +331,9 @@ class phase3launchFactory
//----------------------------------------------------------------------
// phase3: do the numerical work
//----------------------------------------------------------------------

C->jumbled = true;
C->nzombies = bucketp[1]; //set pre-zombie counts
const int64_t Cnz = GB_nnz (C) ;
const int64_t nz = end - start; // number of dots in this bucket
const int64_t mnvec = M->nvec ;

int gridsz, blocksz, sz = 4;
@@ -332,10 +344,13 @@ class phase3launchFactory
/**
* Configure geometry and kernel function name based on sparsity of C and number of vectors in M
*/
configure(Cnz, mnvec, final_kernel_name_ss, blocksz, gridsz, sz);
configure( nz, mnvec, final_kernel_name_ss, blocksz, gridsz, sz);

auto sr_code = std::to_string(semiring_factory_.sr_code);

std::string hashable_name = base_name + "_" + final_kernel_name_ss.str();
std::stringstream string_to_be_jitted ;
std::vector<std::string> template_types = {C->type->name, A->type->name, B->type->name};

jit::GBJitCache filecache = jit::GBJitCache::Instance() ;
filecache.getFile (semiring_factory_) ;
@@ -347,17 +362,16 @@ class phase3launchFactory
dim3 grid(gridsz);
dim3 block(blocksz);

C->nzombies = 0;
GBURBLE ("(GPU phase3 launch st,end=%ld,%ld nblocks,blocksize= %d,%d )\n",start,end,gridsz,blocksz) ;
jit::launcher( hashable_name,
jit::launcher( hashable_name + "_" + M->type->name + "_" + sr_code,
string_to_be_jitted.str(),
header_names,
compiler_flags,
file_callback)
.set_kernel_inst(final_kernel_name_ss.str(),
{ C->type->name,
A->type->name,
B->type->name })
.set_kernel_inst(final_kernel_name_ss.str(), template_types )
// { C->type->name,
// A->type->name,
// B->type->name })
.configure(grid, block) //if commented, use implicit 1D configure in launch
.launch(
start, // input/output:
@@ -386,6 +400,7 @@ class phase3launchFactory
int number_of_sms = GB_Global_gpu_sm_get (0) ;

std::string Opname;
// TODO: make sure this works with different geometry

printf("LAUNCHING BUCKET CODE: %d\n", (int)bucket_code_);
switch (bucket_code_)
@@ -706,4 +721,4 @@ inline bool GB_cuda_reduce(GrB_Matrix A, void *output, GrB_Monoid op) {
//
//
#endif // C++11
#endif
#endif
31 changes: 17 additions & 14 deletions CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cuh
@@ -72,7 +72,7 @@ T block_ReduceSum(thread_block g, T val, T Ident)

//tile.sync(); // Wait for all partial reductions

if (wid > 0 || gridDim.x == 1 ) return val;
if (wid > 0 ) return val;

//read from shared memory only if that warp existed
val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : Ident ;
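For context, a generic standalone sketch of the warp-shuffle block reduction this function follows (the textbook pattern for a sum with identity 0, written here with cooperative groups; it is not the exact template above): each warp reduces its own values, warp leaders park their partial sums in shared memory, and warp 0 finishes the reduction, which is why threads in any other warp can simply return.

    #include <cstdio>
    #include <cooperative_groups.h>
    using namespace cooperative_groups ;

    __device__ int warp_sum (thread_block_tile<32> tile, int val)
    {
        // tree reduction within one warp; lane 0 ends up with the warp total
        for (int offset = tile.size ( ) / 2 ; offset > 0 ; offset /= 2)
        {
            val += tile.shfl_down (val, offset) ;
        }
        return (val) ;
    }

    __device__ int block_sum (thread_block g, int val)
    {
        static __shared__ int shared [32] ;             // one slot per warp
        int lane = threadIdx.x & 31 ;
        int wid  = threadIdx.x >> 5 ;
        thread_block_tile<32> tile = tiled_partition<32> (g) ;

        val = warp_sum (tile, val) ;                    // step 1: reduce within each warp
        if (lane == 0) shared [wid] = val ;             // step 2: warp leaders publish partials
        g.sync ( ) ;

        if (wid > 0) return (val) ;                     // only warp 0 continues
        val = (threadIdx.x < (blockDim.x >> 5)) ? shared [lane] : 0 ;
        return (warp_sum (tile, val)) ;                 // step 3: thread 0 holds the block sum
    }

    __global__ void block_sum_demo (int *out)
    {
        int total = block_sum (this_thread_block ( ), (int) threadIdx.x) ;
        if (threadIdx.x == 0) out [0] = total ;
    }

    int main (void)
    {
        int *out = nullptr ;
        cudaMallocManaged (&out, sizeof (int)) ;
        block_sum_demo <<<1, 128>>> (out) ;
        cudaDeviceSynchronize ( ) ;
        printf ("block sum = %d (expected %d)\n", out [0], 127 * 128 / 2) ;
        cudaFree (out) ;
        return (0) ;
    }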
@@ -88,7 +88,7 @@ __global__ void AxB_dot3_phase3_dndn
(
int64_t start,
int64_t end,
int64_t *Bucket,
int64_t *Bucket, // do the work in Bucket [start:end-1]
GrB_Matrix C,
GrB_Matrix M,
GrB_Matrix A,
@@ -108,6 +108,7 @@ __global__ void AxB_dot3_phase3_dndn

// zombie count
int zc = 0;
// dot pair and index in bucket
int64_t pair_id;

// total items to be inspected
@@ -116,26 +117,27 @@ __global__ void AxB_dot3_phase3_dndn
int s = blockDim.x;

// Main loop over pairs
for (pair_id = start + blockIdx.x; //warp per pair
pair_id < end;
pair_id += gridDim.x ){
for ( int64_t kk = start + blockIdx.x; //warp per pair
kk < end;
kk += gridDim.x ){

pair_id = Bucket [ kk ];
int64_t i = Mi[pair_id];
int64_t j = Ci[pair_id] >> 4;

int64_t pA = Ap[i];
int64_t xend = Ap[i+1];
int64_t pA = Ap[i];
int64_t xend = Ap[i+1];
nnzA = xend - pA;

int64_t pB = Bp[j];
int64_t yend = Bp[j+1];
int64_t pB = Bp[j];
int64_t yend = Bp[j+1];
nnzB = yend - pB;

if (threadIdx.x == 0 ){
printf("tid=%d, i,j = %d,%d nnzA= %d, nnzB=%d\n",
threadIdx.x, (int)i,(int)j, (int)nnzA, (int)nnzB);
}
__syncthreads();
if (threadIdx.x == 0 ){
printf("tid=%d, i,j = %d,%d nnzA= %d, nnzB=%d\n",
threadIdx.x, (int)i,(int)j, (int)nnzA, (int)nnzB);
}
__syncthreads();


// convert global data pointer to the local pointer of this block
@@ -170,6 +172,7 @@ __global__ void AxB_dot3_phase3_dndn
GB_PUTC( Ci[pair_id]=i ) ;
}
//__syncthreads ( ) ;
// FIXME: add atomics to sum up block zombies to C->nzombies
}

}
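A small standalone sketch (hypothetical data) of the indexing change in the main loop above: phase2 groups the pair ids of each bucket contiguously in Bucket, and the kernel now walks Bucket [start:end-1] and works on pair_id = Bucket [kk], instead of treating the loop counter itself as the pair id.

    #include <cstdint>
    #include <cstdio>

    int main (void)
    {
        // hypothetical contents: Bucket holds pair ids, grouped by bucket
        int64_t Bucket [ ] = {9, 2, 5, 0, 7, 3} ;
        int64_t start = 2, end = 5 ;                // this bucket owns Bucket [2..4]

        for (int64_t kk = start ; kk < end ; kk++)
        {
            int64_t pair_id = Bucket [kk] ;         // the indirection added in this commit
            printf ("work on dot product pair %ld\n", (long) pair_id) ;  // 5, 0, 7
        }
        return (0) ;
    }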
10 changes: 6 additions & 4 deletions CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cuh
@@ -77,7 +77,7 @@ __global__ void AxB_dot3_phase3_mp
(
int64_t start,
int64_t end,
int64_t *Bucket,
int64_t *Bucket, // do the work in Bucket [start:end-1]
GrB_Matrix C,
GrB_Matrix M,
GrB_Matrix A,
@@ -120,11 +120,13 @@ __global__ void AxB_dot3_phase3_mp
// int has_zombies = 0 ;

// Main loop over pairs
for (pair_id = start+ blockIdx.x; //warp per pair
pair_id < end;
pair_id += gridDim.x )
int64_t kk ;
for (kk = start+ blockIdx.x; //warp per pair
kk < end;
kk += gridDim.x )
{

pair_id = Bucket [kk] ;
int64_t i = Mi[pair_id];
int64_t j = Ci[pair_id] >> 4;

16 changes: 8 additions & 8 deletions CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cuh
@@ -50,7 +50,7 @@ __global__ void AxB_dot3_phase3_spdn
(
int64_t start,
int64_t end,
int64_t *Bucket,
int64_t *Bucket, // do the work in Bucket [start:end-1]
GrB_Matrix C,
GrB_Matrix M,
GrB_Matrix A,
@@ -87,27 +87,27 @@ __global__ void AxB_dot3_phase3_spdn
for ( int tid= threadIdx.x +blockDim.x*blockIdx.x;
tid < dots;
tid += blockDim.x * gridDim.x) {
int pair_id, im;
int64_t kk, pair_id, im;
// if (threadIdx.x ==0)
// printf("thd%u pi=%lld\n",tid, start+threadIdx.x);
// __syncthreads();

for (pair_id = start+tid, im = 0;
im < m && pair_id < end;
++im, pair_id += dots ){
for (int64_t kk = start+tid, im = 0;
kk < end && im < m ;
kk += dots, ++im ){

pair_id = Bucket[ kk ] ;
int64_t i = Mi[pair_id]; // cols from mask

// TODO: column of Ci / 16?
int64_t j = Ci[pair_id] >> 4; // row number of C
int64_t j = Ci[pair_id] >> 4; // row number of C previously encoded in phase1

//printf("tid=%d, i=%lu, j=%lu\n", threadIdx.x, i, j);

// if (threadIdx.x ==0)
// printf("thd%u i,j=%lld,%lld\n",tid, i,j);
// __syncthreads();

// Prime row offsets for both A and B
// Prep row offsets for both A and B
int64_t pA = Ap[i]; // row of C
int64_t pA_end = Ap[i+1];
int64_t nnzA = pA_end - pA;