Commit 8881e16
Merge pull request #140 from DrTimothyAldenDavis/master
Master
DrTimothyAldenDavis authored Apr 26, 2022
2 parents 4bb44bb + 3f7827f commit 8881e16
Showing 31 changed files with 2,892 additions and 169 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
@@ -26,10 +26,10 @@ endif ( )
set ( CMAKE_MACOSX_RPATH TRUE )

# version of SuiteSparse:GraphBLAS
set ( GraphBLAS_DATE "Apr 8, 2022" )
set ( GraphBLAS_DATE "Apr 25, 2022" )
set ( GraphBLAS_VERSION_MAJOR 7 )
set ( GraphBLAS_VERSION_MINOR 0 )
set ( GraphBLAS_VERSION_SUB 3 )
set ( GraphBLAS_VERSION_SUB 4 )

message ( STATUS "Building SuiteSparse:GraphBLAS version: v" ${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB} " date: " ${GraphBLAS_DATE} )

35 changes: 17 additions & 18 deletions CUDA/GB_AxB_dot3_cuda.cpp
@@ -206,7 +206,8 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
phase2endlaunchFactory p2elf;


// # of threads in phase1 and phase2 kernel launches must be the same
// # of threads in phase1 and phase2 kernel launches are related
// # by the size of the warp. ph2_task = ph1_task/32 for example
int nthrd = p2lf.get_threads_per_block();
int ntasks = p2elf.get_number_of_blocks(M);
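A minimal standalone sketch of the relationship described in the comment above, using hypothetical task counts (the warp size of 32 is the only fixed quantity): phase2 reduces the per-block counts produced by phase1, so its task count is roughly the phase1 task count divided by 32, rounded up.

    // illustration only -- hypothetical counts, not values computed by this code
    const int warp_size = 32 ;
    int ph1_tasks = 1000 ;                                    // blocks launched for phase1
    int ph2_tasks = (ph1_tasks + warp_size - 1) / warp_size ; // = 32 blocks for phase2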

@@ -267,21 +268,22 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method

GBURBLE ("(GPU phase1 done) ") ;

print_array<int64_t>(Nanobuckets, nanobuckets_size, "Nanobuckets");
print_array<int64_t>(Blockbucket, blockbuckets_size , "Blockbucket");
//print_array<int64_t>(Nanobuckets, nanobuckets_size, "Nanobuckets");
printf(" using %ld blockbuckets \n", blockbuckets_size);
//print_array<int64_t>(Blockbucket, blockbuckets_size , "Blockbucket");

//----------------------------------------------------------------------
// phase2: cumsum across the blockbuckets, propagate to thread level
//----------------------------------------------------------------------

GBURBLE ("(GPU phase1 start) ") ;

p2lf.jitGridBlockLaunch(Blockbucket, offset, M);
p2lf.jitGridBlockLaunch(Blockbucket, offset, M );

int64_t s= 0;
for ( int bucket = 0 ; bucket < NBUCKETS+1; ++bucket)
int64_t s= offset[0];
for ( int bucket = 1 ; bucket < NBUCKETS+1; ++bucket)
{
Bucketp[bucket] = s;
Bucketp[bucket] = s;
s+= offset[bucket];
printf("bucketp[%d] = %ld, offset=%ld\n", bucket, Bucketp[bucket], offset[bucket]);
}
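For reference, a standalone sketch of the cumulative-sum pattern the loop above implements (hypothetical counts; NBUCKETS is given a made-up value here): Bucketp[b] ends up holding the sum of offset[0..b-1], so Bucketp[b+1] - Bucketp[b] is the number of dot products that fell into bucket b, and Bucketp[1] is the size of bucket 0, the pre-zombie entries.

    #include <cstdio>
    #include <cstdint>

    int main (void)
    {
        const int NBUCKETS = 12 ;                       // hypothetical value for this sketch
        int64_t offset  [NBUCKETS+1] = {5, 3, 0, 7} ;   // hypothetical per-bucket counts
        int64_t Bucketp [NBUCKETS+1] = {0} ;

        int64_t s = offset [0] ;                        // bucket 0: the pre-zombie count
        for (int bucket = 1 ; bucket < NBUCKETS+1 ; ++bucket)
        {
            Bucketp [bucket] = s ;                      // exclusive prefix sum of offset [ ]
            s += offset [bucket] ;
        }
        // Bucketp = {0, 5, 8, 8, 15, 15, ...}
        printf ("bucket 3 holds %ld entries\n", (long) (Bucketp [4] - Bucketp [3])) ;   // 7
        return (0) ;
    }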
@@ -295,38 +297,35 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method

GBURBLE ("(GPU phase2end done) ") ;

print_array<int64_t>(Bucket, mnz , "Bucket");
print_array<int64_t>(M->i, mnz , "M->i");
print_array<int64_t>(C->i, mnz , "C->i");
//print_array<int64_t>(Bucket, mnz , "Bucket");
//print_array<int64_t>(M->i, mnz , "M->i");
//print_array<int64_t>(C->i, mnz , "C->i");

//----------------------------------------------------------------------
// phase3: do the numerical work
//----------------------------------------------------------------------

print_array<int64_t>(Bucketp, NBUCKETS + 1 , "Bucketp");
C->nzombies = Bucketp[1]; //set pre-zombie counts
printf("pre-kernel C->nzombies=%ld\n", C->nzombies);
printf("pre-phase3 kernel C->nzombies=%ld\n", C->nzombies);

for ( int bucket = 1 ; bucket < NBUCKETS; ++bucket)
{
int64_t start = Bucketp[bucket];
int64_t end = Bucketp[bucket+1];

int64_t end = Bucketp[bucket + 1 ];

if(end - start > 0) {
printf("Executing bucket: %d with %ld edges\n", bucket, end-start);
// TODO: We might want to consider submitting these in different cuda streams (maybe use cuda stream pool?)
phase3launchFactory p3lf(mysemiring, (GB_bucket_code)bucket);
p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket, C, M, A, B);
p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket, C, M, A, B);
} else {
printf("Skipping bucket %d, no work to do\n", bucket);
}

GBURBLE ("(GPU phase3 done ) ") ;
}
C->nzombies += Bucketp[1];
printf("C->p[0]=%ld\n", C->p[0]);
printf("C->p[1]=%ld\n", C->p[1]);
//printf("C->p[0]=%ld\n", C->p[0]);
//printf("C->p[1]=%ld\n", C->p[1]);
printf("C->nzombies=%ld\n", C->nzombies);

GB_FREE_WORKSPACE ;
8 changes: 7 additions & 1 deletion CUDA/GB_cuda_semiring_factory.hpp
@@ -22,12 +22,15 @@ extern "C"
//std::istream* (*file_callback)(std::string, std::iostream&);

// Define a factory class for building any semiring text definitions

// FIXME: Rename to GrB_MxM_problem_spec and delegate problem generation to data factory
class GB_cuda_semiring_factory: public jit::File_Desc {

public:

uint32_t mask_ecode;
uint64_t sr_code;
bool mask_struct;
bool mask_comp;

// file ptr
FILE *fp;
@@ -55,6 +58,8 @@ class GB_cuda_semiring_factory: public jit::File_Desc {
// input:
GrB_Semiring semiring, // the semiring to enumify
bool flipxy, // multiplier is: mult(a,b) or mult(b,a)

// FIXME: Just use GrB_Matrix here
GrB_Type ctype, // the type of C
GrB_Type mtype, // the type of M, or NULL if no mask
GrB_Type atype, // the type of A
@@ -109,6 +114,7 @@ class GB_cuda_semiring_factory: public jit::File_Desc {
// operators, datatypes, sparsity formats and produces a character buffer.
//------------------------------------------------------------------------------

// FIXME: Also need mask code macrofication
void macrofy ( ) override
{
std::cout<<" calling macrofy semiring. sr_code="<< this->sr_code << std::endl;
2 changes: 0 additions & 2 deletions CUDA/GB_reduce_to_scalar_cuda.cpp
@@ -37,8 +37,6 @@ GrB_Info GB_reduce_to_scalar_cuda

GB_cuda_reduce( A, s, reduce);

printf("num_triangles = %d\n", s[0] );

return GrB_SUCCESS ;
}

39 changes: 27 additions & 12 deletions CUDA/jitFactory.hpp
@@ -142,7 +142,7 @@ class phase1launchFactory
std::cout << "B TYpe: " << B->type << std::endl;
// // (1) create the semiring code and name

// // (2) ensure the jitifier has "GB_semiring_[mysemiring.sr_code].h"
// // (2) ensure the jitifier has "GB_semiring_[mysemiring.sr_code].h"
jit::GBJitCache filecache = jit::GBJitCache::Instance() ;
filecache.getFile (semiring_factory_) ;

@@ -162,6 +162,11 @@ class phase1launchFactory
dim3 grid(get_number_of_blocks(M));
dim3 block(get_threads_per_block());

// for (auto s:compiler_flags)
// {
// std::cout << "Compiler Flags: " << s << std::endl ;
// }

jit::launcher( hashable_name + "_" + M->type->name + "_" + sr_code,
string_to_be_jitted.str(),
header_names,
@@ -199,6 +204,13 @@ class phase2launchFactory
return (ntasks + threads_per_block - 1) / threads_per_block ;
}

int get_number_of_phase1_blocks( GrB_Matrix M){
const int64_t mnz = GB_nnz (M) ;
int number_of_sms = GB_Global_gpu_sm_get (0);
int nblks = ( GB_nnz (M) + chunk_size - 1)/chunk_size;
return GB_IMIN( nblks, 128 * number_of_sms);
}

bool jitGridBlockLaunch(// parameters to AxB_phase2:
int64_t *blockBucket, int64_t *offset, GrB_Matrix M) {

@@ -224,7 +236,7 @@ class phase2launchFactory
.set_kernel_inst( kernel_name, {})
.configure(grid, block)
// parameters to AxB_phase2:
.launch( blockBucket, offset, get_number_of_blocks(M));
.launch( blockBucket, offset, get_number_of_phase1_blocks(M));

checkCudaErrors( cudaDeviceSynchronize() );
result= true;
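A standalone illustration (hypothetical sizes and an assumed bucket-major layout of blockBucket, which this diff does not define) of why the launch above now passes get_number_of_phase1_blocks(M): phase2 reduces one counter per bucket per phase1 block, so it must be told how many blocks phase1 actually used, not how many blocks phase2 itself launches.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main (void)
    {
        const int NBUCKETS = 12 ;            // hypothetical
        int nblocks_phase1 = 4 ;             // what get_number_of_phase1_blocks(M) would report
        // one counter per (bucket, phase1 block); all set to 1 for the illustration
        std::vector<int64_t> blockBucket (NBUCKETS * nblocks_phase1, 1) ;
        std::vector<int64_t> offset (NBUCKETS + 1, 0) ;

        for (int b = 0 ; b < NBUCKETS ; b++)
        {
            for (int blk = 0 ; blk < nblocks_phase1 ; blk++)
            {
                // sum across *phase1* blocks; using phase2's own block count here
                // would read the wrong number of counters
                offset [b] += blockBucket [(size_t) b * nblocks_phase1 + blk] ;
            }
        }
        printf ("bucket 0 total = %ld\n", (long) offset [0]) ;   // 4
        return (0) ;
    }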
@@ -319,9 +331,9 @@ class phase3launchFactory
//----------------------------------------------------------------------
// phase3: do the numerical work
//----------------------------------------------------------------------

C->jumbled = true;
C->nzombies = bucketp[1]; //set pre-zombie counts
const int64_t Cnz = GB_nnz (C) ;
const int64_t nz = end - start; // number of dots in this bucket
const int64_t mnvec = M->nvec ;

int gridsz, blocksz, sz = 4;
@@ -332,10 +344,13 @@ class phase3launchFactory
/**
* Configure geometry and kernel function name based on sparsity of C and number of vectors in M
*/
configure(Cnz, mnvec, final_kernel_name_ss, blocksz, gridsz, sz);
configure( nz, mnvec, final_kernel_name_ss, blocksz, gridsz, sz);

auto sr_code = std::to_string(semiring_factory_.sr_code);

std::string hashable_name = base_name + "_" + final_kernel_name_ss.str();
std::stringstream string_to_be_jitted ;
std::vector<std::string> template_types = {C->type->name, A->type->name, B->type->name};

jit::GBJitCache filecache = jit::GBJitCache::Instance() ;
filecache.getFile (semiring_factory_) ;
@@ -347,17 +362,16 @@ class phase3launchFactory
dim3 grid(gridsz);
dim3 block(blocksz);

C->nzombies = 0;
GBURBLE ("(GPU phase3 launch st,end=%ld,%ld nblocks,blocksize= %d,%d )\n",start,end,gridsz,blocksz) ;
jit::launcher( hashable_name,
jit::launcher( hashable_name + "_" + M->type->name + "_" + sr_code,
string_to_be_jitted.str(),
header_names,
compiler_flags,
file_callback)
.set_kernel_inst(final_kernel_name_ss.str(),
{ C->type->name,
A->type->name,
B->type->name })
.set_kernel_inst(final_kernel_name_ss.str(), template_types )
// { C->type->name,
// A->type->name,
// B->type->name })
.configure(grid, block) //if commented, use implicit 1D configure in launch
.launch(
start, // input/output:
@@ -386,6 +400,7 @@ class phase3launchFactory
int number_of_sms = GB_Global_gpu_sm_get (0) ;

std::string Opname;
// TODO: make sure this works with different geometry

printf("LAUNCHING BUCKET CODE: %d\n", (int)bucket_code_);
switch (bucket_code_)
@@ -706,4 +721,4 @@ inline bool GB_cuda_reduce(GrB_Matrix A, void *output, GrB_Monoid op) {
//
//
#endif // C++11
#endif
#endif
31 changes: 17 additions & 14 deletions CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cuh
@@ -72,7 +72,7 @@ T block_ReduceSum(thread_block g, T val, T Ident)

//tile.sync(); // Wait for all partial reductions

if (wid > 0 || gridDim.x == 1 ) return val;
if (wid > 0 ) return val;

//read from shared memory only if that warp existed
val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : Ident ;
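For context, a generic standalone sketch of the warp-shuffle block reduction this function follows (the textbook pattern for a sum with identity 0, written here with cooperative groups; it is not the exact template above): each warp reduces its own values, warp leaders park their partial sums in shared memory, and warp 0 finishes the reduction, which is why threads in any other warp can simply return.

    #include <cstdio>
    #include <cooperative_groups.h>
    using namespace cooperative_groups ;

    __device__ int warp_sum (thread_block_tile<32> tile, int val)
    {
        // tree reduction within one warp; lane 0 ends up with the warp total
        for (int offset = tile.size ( ) / 2 ; offset > 0 ; offset /= 2)
        {
            val += tile.shfl_down (val, offset) ;
        }
        return (val) ;
    }

    __device__ int block_sum (thread_block g, int val)
    {
        static __shared__ int shared [32] ;             // one slot per warp
        int lane = threadIdx.x & 31 ;
        int wid  = threadIdx.x >> 5 ;
        thread_block_tile<32> tile = tiled_partition<32> (g) ;

        val = warp_sum (tile, val) ;                    // step 1: reduce within each warp
        if (lane == 0) shared [wid] = val ;             // step 2: warp leaders publish partials
        g.sync ( ) ;

        if (wid > 0) return (val) ;                     // only warp 0 continues
        val = (threadIdx.x < (blockDim.x >> 5)) ? shared [lane] : 0 ;
        return (warp_sum (tile, val)) ;                 // step 3: thread 0 holds the block sum
    }

    __global__ void block_sum_demo (int *out)
    {
        int total = block_sum (this_thread_block ( ), (int) threadIdx.x) ;
        if (threadIdx.x == 0) out [0] = total ;
    }

    int main (void)
    {
        int *out = nullptr ;
        cudaMallocManaged (&out, sizeof (int)) ;
        block_sum_demo <<<1, 128>>> (out) ;
        cudaDeviceSynchronize ( ) ;
        printf ("block sum = %d (expected %d)\n", out [0], 127 * 128 / 2) ;
        cudaFree (out) ;
        return (0) ;
    }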
@@ -88,7 +88,7 @@ __global__ void AxB_dot3_phase3_dndn
(
int64_t start,
int64_t end,
int64_t *Bucket,
int64_t *Bucket, // do the work in Bucket [start:end-1]
GrB_Matrix C,
GrB_Matrix M,
GrB_Matrix A,
@@ -108,6 +108,7 @@ __global__ void AxB_dot3_phase3_dndn

// zombie count
int zc = 0;
// dot pair and index in bucket
int64_t pair_id;

// total items to be inspected
@@ -116,26 +117,27 @@ __global__ void AxB_dot3_phase3_dndn
int s = blockDim.x;

// Main loop over pairs
for (pair_id = start + blockIdx.x; //warp per pair
pair_id < end;
pair_id += gridDim.x ){
for ( int64_t kk = start + blockIdx.x; //warp per pair
kk < end;
kk += gridDim.x ){

pair_id = Bucket [ kk ];
int64_t i = Mi[pair_id];
int64_t j = Ci[pair_id] >> 4;

int64_t pA = Ap[i];
int64_t xend = Ap[i+1];
int64_t pA = Ap[i];
int64_t xend = Ap[i+1];
nnzA = xend - pA;

int64_t pB = Bp[j];
int64_t yend = Bp[j+1];
int64_t pB = Bp[j];
int64_t yend = Bp[j+1];
nnzB = yend - pB;

if (threadIdx.x == 0 ){
printf("tid=%d, i,j = %d,%d nnzA= %d, nnzB=%d\n",
threadIdx.x, (int)i,(int)j, (int)nnzA, (int)nnzB);
}
__syncthreads();
if (threadIdx.x == 0 ){
printf("tid=%d, i,j = %d,%d nnzA= %d, nnzB=%d\n",
threadIdx.x, (int)i,(int)j, (int)nnzA, (int)nnzB);
}
__syncthreads();


// convert global data pointer to the local pointer of this block
@@ -170,6 +172,7 @@ __global__ void AxB_dot3_phase3_dndn
GB_PUTC( Ci[pair_id]=i ) ;
}
//__syncthreads ( ) ;
// FIXME: add atomics to sum up block zombies to C->nzombies
}

}
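A small standalone sketch (hypothetical data) of the indexing change in the main loop above: phase2 groups the pair ids of each bucket contiguously in Bucket, and the kernel now walks Bucket [start:end-1] and works on pair_id = Bucket [kk], instead of treating the loop counter itself as the pair id.

    #include <cstdint>
    #include <cstdio>

    int main (void)
    {
        // hypothetical contents: Bucket holds pair ids, grouped by bucket
        int64_t Bucket [ ] = {9, 2, 5, 0, 7, 3} ;
        int64_t start = 2, end = 5 ;                // this bucket owns Bucket [2..4]

        for (int64_t kk = start ; kk < end ; kk++)
        {
            int64_t pair_id = Bucket [kk] ;         // the indirection added in this commit
            printf ("work on dot product pair %ld\n", (long) pair_id) ;  // 5, 0, 7
        }
        return (0) ;
    }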
10 changes: 6 additions & 4 deletions CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cuh
@@ -77,7 +77,7 @@ __global__ void AxB_dot3_phase3_mp
(
int64_t start,
int64_t end,
int64_t *Bucket,
int64_t *Bucket, // do the work in Bucket [start:end-1]
GrB_Matrix C,
GrB_Matrix M,
GrB_Matrix A,
@@ -120,11 +120,13 @@ __global__ void AxB_dot3_phase3_mp
// int has_zombies = 0 ;

// Main loop over pairs
for (pair_id = start+ blockIdx.x; //warp per pair
pair_id < end;
pair_id += gridDim.x )
int64_t kk ;
for (kk = start+ blockIdx.x; //warp per pair
kk < end;
kk += gridDim.x )
{

pair_id = Bucket [kk] ;
int64_t i = Mi[pair_id];
int64_t j = Ci[pair_id] >> 4;

16 changes: 8 additions & 8 deletions CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cuh
@@ -50,7 +50,7 @@ __global__ void AxB_dot3_phase3_spdn
(
int64_t start,
int64_t end,
int64_t *Bucket,
int64_t *Bucket, // do the work in Bucket [start:end-1]
GrB_Matrix C,
GrB_Matrix M,
GrB_Matrix A,
@@ -87,27 +87,27 @@ __global__ void AxB_dot3_phase3_spdn
for ( int tid= threadIdx.x +blockDim.x*blockIdx.x;
tid < dots;
tid += blockDim.x * gridDim.x) {
int pair_id, im;
int64_t kk, pair_id, im;
// if (threadIdx.x ==0)
// printf("thd%u pi=%lld\n",tid, start+threadIdx.x);
// __syncthreads();

for (pair_id = start+tid, im = 0;
im < m && pair_id < end;
++im, pair_id += dots ){
for (int64_t kk = start+tid, im = 0;
kk < end && im < m ;
kk += dots, ++im ){

pair_id = Bucket[ kk ] ;
int64_t i = Mi[pair_id]; // cols from mask

// TODO: column of Ci / 16?
int64_t j = Ci[pair_id] >> 4; // row number of C
int64_t j = Ci[pair_id] >> 4; // row number of C previously encoded in phase1

//printf("tid=%d, i=%lu, j=%lu\n", threadIdx.x, i, j);

// if (threadIdx.x ==0)
// printf("thd%u i,j=%lld,%lld\n",tid, i,j);
// __syncthreads();

// Prime row offsets for both A and B
// Prep row offsets for both A and B
int64_t pA = Ap[i]; // row of C
int64_t pA_end = Ap[i+1];
int64_t nnzA = pA_end - pA;