Merge pull request #144 from DrTimothyAldenDavis/master

Master
DrTimothyAldenDavis · May 20, 2022 · 1bf1dde · 1bf1dde
2 parents 8881e16 + 0528f93
commit 1bf1dde
Show file tree

Hide file tree

Showing 116 changed files with 9,891 additions and 40,793 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -26,10 +26,10 @@ endif ( )
 set ( CMAKE_MACOSX_RPATH TRUE )
 
 # version of SuiteSparse:GraphBLAS
-set ( GraphBLAS_DATE "Apr 25, 2022" )
+set ( GraphBLAS_DATE "May 20, 2022" )
 set ( GraphBLAS_VERSION_MAJOR 7 )
-set ( GraphBLAS_VERSION_MINOR 0 )
-set ( GraphBLAS_VERSION_SUB   4 )
+set ( GraphBLAS_VERSION_MINOR 1 )
+set ( GraphBLAS_VERSION_SUB   0 )
 
 message ( STATUS "Building SuiteSparse:GraphBLAS version: v" ${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB}  " date: " ${GraphBLAS_DATE} )
 

diff --git a/CUDA/.gitignore b/CUDA/.gitignore
@@ -4,6 +4,7 @@
 *.so
 jitFactory
 stringify
+rmm_log.txt
 
 # Do not ignore this file
 !.gitignore

diff --git a/CUDA/GB_AxB_dot3_cuda.cpp b/CUDA/GB_AxB_dot3_cuda.cpp
@@ -139,12 +139,17 @@ GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
     int64_t cnz = mnz ;
     int64_t cnvec = mnvec ;
 
-    int sparsity_M = (M_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE ;
+    int M_sparsity = (M_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE ;
+    int C_sparsity = M_sparsity ;
+    bool C_iso = false ;
     info = GB_new_bix (&C, // sparse or hyper (from M), existing header
         ctype, cvlen, cvdim, GB_Ap_malloc, true,
-        sparsity_M, false, M->hyper_switch, cnvec,
+        M_sparsity, false, M->hyper_switch, cnvec,
         cnz+1,  // add one to cnz for GB_cumsum of Cwork 
-        true, /* not iso: */ false, Context) ;
+        true, C_iso, Context) ;
+
+    CHECK_CUDA_SIMPLE(cudaMemset(C->i, 0, (cnz+1) * sizeof(int64_t)));
+    CHECK_CUDA_SIMPLE(cudaMemset(C->x, 0, (cnz+1) * sizeof(ctype->size)));
 
     if (info != GrB_SUCCESS)
     { 
@@ -174,7 +179,7 @@ GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
 
     C->magic = GB_MAGIC ;
     C->nvec_nonempty = M->nvec_nonempty ;
-    C->nvec = M->nvec ;
+//    C->nvec = M->nvec ;
     // the dot3 CUDA kernel will produce C->i with jumbled indices
     C->jumbled = true ;
 
@@ -183,16 +188,15 @@ GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
     // stringify the semiring and the mask
     //--------------------------------------------------------------------------
 
-    GB_cuda_semiring_factory mysemiring = GB_cuda_semiring_factory ( ) ;
+    GB_cuda_mxm_factory my_mxm_spec = GB_cuda_mxm_factory ( ) ;
 
-    // (1) create the semiring code and name
-    mysemiring.semiring_factory ( semiring, flipxy,
-        ctype, M->type, A->type, B->type, Mask_struct,  // matrix types
-        false, GB_sparsity(C), GB_sparsity(M), GB_sparsity(A), GB_sparsity(B) ) ;
+    // (1) create the mxm code and name
+    my_mxm_spec.mxm_factory ( C_iso, C_sparsity, ctype,
+        M, Mask_struct, false, semiring, flipxy, A, B) ;
 
-    // (2) ensure the jitifier has "GB_semiring_[mysemiring.sr_code].h"
+    // (2) ensure the jitifier has "GB_mxm_[my_mxm_spec.sr_code].h"
     jit::GBJitCache filecache = jit::GBJitCache::Instance() ;
-    filecache.getFile (mysemiring) ;
+    filecache.getFile (my_mxm_spec) ;
 
     GBURBLE ("(GPU stringified) ") ;
     //--------------------------------------------------------------------------
@@ -201,7 +205,7 @@ GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
 
     // on the CPU: nthreads = GB_nthreads (cnz, chunk, nthreads_max) ;
     // on the GPU:
-    phase1launchFactory p1lf(mysemiring);
+    phase1launchFactory p1lf(my_mxm_spec);
     phase2launchFactory p2lf;
     phase2endlaunchFactory p2elf;
 
@@ -233,26 +237,28 @@ GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
     CHECK_CUDA_SIMPLE(cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
     CHECK_CUDA_SIMPLE(cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t), cudaMemAdviseSetAccessedBy, device));
 
-    offset = (int64_t*)rmm_wrap_malloc( (NBUCKETS)*sizeof(int64_t)) ;
     CHECK_CUDA_SIMPLE(cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
     CHECK_CUDA_SIMPLE(cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t), cudaMemAdviseSetAccessedBy, device));
 
-    memset( offset, 0, NBUCKETS * sizeof(int64_t) );
-
     //--------------------------------------------------------------------------
     // Pre-fetch arrays that will be used on the device
     //--------------------------------------------------------------------------
 
     CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->p, (mnvec+1) * sizeof (int64_t), device, NULL)) ; //stream_data) ;
     CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->i, mnz * sizeof (int64_t), device, NULL )) ; //stream_data) ;
+    // FIXME: if Mask_struct is true, skip this:
     CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->x, mnz * M->type->size, device, NULL )) ; //stream_data) ;
+
     CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->i, (cnz+1) * sizeof (int64_t), device, NULL )); //stream_data) ;
+    // FIXME: skip if C iso:
     CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->x, (cnz+1) * C->type->size, device, NULL )); //stream_data) ;
     CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->p, (anvec+1) * sizeof (int64_t), device, NULL)); // stream_data) ;
     CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->i, anz * sizeof (int64_t), device, NULL )) ; //stream_data) ;
+    // FIXME: skip if A iso:
     CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->x, anz * A->type->size, device, NULL )) ; //stream_data) ;
     CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( B->p, (bnvec+1) * sizeof (int64_t), device, NULL)); //stream_data) ;
     CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( B->i, bnz * sizeof (int64_t), device, NULL )); //stream_data) ;
+    // FIXME: skip if B iso:
     CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( B->x, bnz * B->type->size, device, NULL )); //stream_data) ;
 
     // The work to compute C(i,j) is held in Ci [p], if C(i,j) appears in
@@ -281,6 +287,7 @@ GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
     p2lf.jitGridBlockLaunch(Blockbucket, offset, M );
 
     int64_t s= offset[0];
+    C->nzombies = s;
     for ( int bucket = 1 ; bucket < NBUCKETS+1; ++bucket)
     {
         Bucketp[bucket] = s; 
@@ -316,7 +323,7 @@ GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
         if(end - start > 0) {
             printf("Executing bucket: %d with %ld edges\n", bucket, end-start);
             // TODO: We might want to consider submitting these in different cuda streams (maybe use cuda stream pool?)
-            phase3launchFactory p3lf(mysemiring, (GB_bucket_code)bucket);
+            phase3launchFactory p3lf(my_mxm_spec, (GB_bucket_code)bucket);
             p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket, C, M, A, B);
         } else {
             printf("Skipping bucket %d, no work to do\n", bucket);

diff --git a/CUDA/GB_AxB_dot3_cuda_branch.cpp b/CUDA/GB_AxB_dot3_cuda_branch.cpp
@@ -41,7 +41,6 @@ bool GB_AxB_dot3_cuda_branch
         GBURBLE (" work:%g GPUs:%d ", work, ngpus_to_use) ;
         if (ngpus_to_use > 0
             // FIXME: FUTURE: user-defined types and operators
-//            && (semiring->header_size == 0)     // semiring is built-in
             && (A->type->code != GB_UDT_code)
             && (B->type->code != GB_UDT_code)
             // FIXME: M could be hypersparse.  we should handle this

diff --git a/CUDA/GB_Matrix_allocate.h b/CUDA/GB_Matrix_allocate.h
@@ -1,8 +1,8 @@
 
 #ifndef GB_MATRIX_ALLOCATE_H
 #define GB_MATRIX_ALLOCATE_H
-#include "matrix.h"
-#include "pmr_malloc.h"
+#include "GB_cuda_kernel.h"
+#include "rmm_wrap.h"
 
 #ifdef __cplusplus
 extern "C" {

diff --git a/CUDA/GB_cuda.h b/CUDA/GB_cuda.h
@@ -21,7 +21,7 @@ extern "C"
 #include "cuda_runtime.h"
 #include "cuda.h"
 #include "jitify.hpp"
-#include "GB_cuda_semiring_factory.hpp"
+#include "GB_cuda_mxm_factory.hpp"
 
 #include <cassert>
 #include <cmath>

diff --git a/CUDA/GB_cuda_kernel.h b/CUDA/GB_cuda_kernel.h
@@ -0,0 +1,237 @@
+//------------------------------------------------------------------------------
+// CUDA/GB_cuda_kernel.h: definitions for all GraphBLAS CUDA kernels
+//------------------------------------------------------------------------------
+
+// SPDX-License-Identifier: Apache-2.0
+
+//------------------------------------------------------------------------------
+
+// This file is #include'd into all CUDA kernels for GraphBLAS.  It provides
+// a
+
+#pragma once
+#undef  ASSERT
+#define ASSERT(x)
+
+//------------------------------------------------------------------------------
+// TODO: this will be in the jit code:
+#define chunksize 128 
+
+//------------------------------------------------------------------------------
+// GETA, GETB: get entries from input matrices A and B
+//------------------------------------------------------------------------------
+
+#if GB_FLIPXY
+
+    #if GB_A_IS_PATTERN
+        #define GB_DECLAREA(aval)
+        #define GB_SHAREDA(aval)
+        #define GB_GETA( aval, ax, p)
+    #else
+        #define GB_DECLAREA(aval) T_Y aval
+        #define GB_SHAREDA(aval) __shared__ T_Y aval
+        #if GB_A_ISO
+            #define GB_GETA( aval, ax, p) aval = (T_Y) (ax [0]) ;
+        #else
+            #define GB_GETA( aval, ax, p) aval = (T_Y) (ax [p]) ;
+        #endif
+    #endif
+
+    #if GB_B_IS_PATTERN
+        #define GB_DECLAREB(bval)
+        #define GB_SHAREDB(bval)
+        #define GB_GETB( bval, bx, p)
+    #else
+        #define GB_DECLAREB(bval) T_X bval
+        #define GB_SHAREDB(bval) __shared__ T_X bval
+        #if GB_B_ISO
+            #define GB_GETB( bval, bx, p) bval = (T_X) (bx [0]) ;
+        #else
+            #define GB_GETB( bval, bx, p) bval = (T_X) (bx [p]) ;
+        #endif
+    #endif
+
+#else
+
+    #if GB_A_IS_PATTERN
+        #define GB_DECLAREA(aval)
+        #define GB_SHAREDA(aval)
+        #define GB_GETA( aval, ax, p)
+    #else
+        #define GB_DECLAREA(aval) T_X aval
+        #define GB_SHAREDA(aval) __shared__ T_X aval
+        #if GB_A_ISO
+            #define GB_GETA( aval, ax, p) aval = (T_X) (ax [0]) ;
+        #else
+            #define GB_GETA( aval, ax, p) aval = (T_X) (ax [p]) ;
+        #endif
+    #endif
+
+    #if GB_B_IS_PATTERN
+        #define GB_DECLAREB(bval)
+        #define GB_SHAREDB(bval)
+        #define GB_GETB( bval, bx, p)
+    #else
+        #define GB_DECLAREB(bval) T_Y bval
+        #define GB_SHAREDB(bval) __shared__ T_Y bval
+        #if GB_B_ISO
+            #define GB_GETB( bval, bx, p) bval = (T_Y) (bx [0]) ;
+        #else
+            #define GB_GETB( bval, bx, p) bval = (T_Y) (bx [p]) ;
+        #endif
+    #endif
+
+#endif
+
+//------------------------------------------------------------------------------
+// operators
+//------------------------------------------------------------------------------
+
+#if GB_C_ISO
+
+    #define GB_ADD_F( f , s)
+    #define GB_C_MULT( c, a, b)
+    #define GB_MULTADD( c, a ,b )
+    #define GB_DOT_TERMINAL ( c )   
+    #define GB_DOT_MERGE                                                \
+    {                                                                   \
+        cij_exists = true ;                                             \
+    }
+
+#else
+
+    #define GB_ADD_F( f , s)  f = GB_ADD ( f, s ) 
+    #define GB_C_MULT( c, a, b)  c = GB_MULT( (a), (b) )
+    #define GB_MULTADD( c, a ,b ) GB_ADD_F( (c), GB_MULT( (a),(b) ) )
+    #define GB_DOT_TERMINAL ( c )   
+    //# if ( c == TERMINAL_VALUE) break;
+    // cij += A(k,i) * B(k,j), for merge operation
+
+    #define GB_DOT_MERGE                                                \
+    {                                                                   \
+        GB_GETA ( aki, Ax, pA) ;       /* aki = A(k,i) */               \
+        GB_GETB ( bkj, Bx, pB) ;       /* bkj = B(k,j) */               \
+        if (cij_exists)                                                 \
+        {                                                               \
+            GB_MULTADD (cij, aki, bkj) ;    /* cij += aki * bkj */      \
+        }                                                               \
+        else                                                            \
+        {                                                               \
+            /* cij = A(k,i) * B(k,j), and add to the pattern    */      \
+            cij_exists = true ;                                         \
+            GB_C_MULT (cij, aki, bkj) ;     /* cij  = aki * bkj */      \
+        }                                                               \
+    }
+
+#endif
+
+//------------------------------------------------------------------------------
+// subset of GraphBLAS.h
+//------------------------------------------------------------------------------
+
+#ifndef GRAPHBLAS_H
+#define GRAPHBLAS_H
+
+#undef restrict
+#undef GB_restrict
+#if defined ( GB_CUDA_KERNEL ) || defined ( __NVCC__ )
+    #define GB_restrict __restrict__
+#else
+    #define GB_restrict
+#endif
+#define restrict GB_restrict
+
+#include <stdint.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <string.h>
+
+// GB_STR: convert the content of x into a string "x"
+#define GB_XSTR(x) GB_STR(x)
+#define GB_STR(x) #x
+
+#undef  GB_PUBLIC
+#define GB_PUBLIC extern
+#undef  GxB_MAX_NAME_LEN
+#define GxB_MAX_NAME_LEN 128
+
+typedef uint64_t GrB_Index ;
+typedef struct GB_Descriptor_opaque *GrB_Descriptor ;
+typedef struct GB_Type_opaque *GrB_Type ;
+typedef struct GB_UnaryOp_opaque *GrB_UnaryOp ;
+typedef struct GB_BinaryOp_opaque *GrB_BinaryOp ;
+typedef struct GB_SelectOp_opaque *GxB_SelectOp ;
+typedef struct GB_IndexUnaryOp_opaque *GrB_IndexUnaryOp ;
+typedef struct GB_Monoid_opaque *GrB_Monoid ;
+typedef struct GB_Semiring_opaque *GrB_Semiring ;
+typedef struct GB_Scalar_opaque *GrB_Scalar ;
+typedef struct GB_Vector_opaque *GrB_Vector ;
+typedef struct GB_Matrix_opaque *GrB_Matrix ;
+
+#define GxB_HYPERSPARSE 1   // store matrix in hypersparse form
+#define GxB_SPARSE      2   // store matrix as sparse form (compressed vector)
+#define GxB_BITMAP      4   // store matrix as a bitmap
+#define GxB_FULL        8   // store matrix as full; all entries must be present
+
+typedef void (*GxB_unary_function)  (void *, const void *) ;
+typedef void (*GxB_binary_function) (void *, const void *, const void *) ;
+
+typedef bool (*GxB_select_function)      // return true if A(i,j) is kept
+(
+    GrB_Index i,                // row index of A(i,j)
+    GrB_Index j,                // column index of A(i,j)
+    const void *x,              // value of A(i,j)
+    const void *thunk           // optional input for select function
+) ;
+
+typedef void (*GxB_index_unary_function)
+(
+    void *z,            // output value z, of type ztype
+    const void *x,      // input value x of type xtype; value of v(i) or A(i,j)
+    GrB_Index i,        // row index of A(i,j)
+    GrB_Index j,        // column index of A(i,j), or zero for v(i)
+    const void *y       // input scalar y
+) ;
+
+typedef enum
+{
+    // for all GrB_Descriptor fields:
+    GxB_DEFAULT = 0,    // default behavior of the method
+
+    // for GrB_OUTP only:
+    GrB_REPLACE = 1,    // clear the output before assigning new values to it
+
+    // for GrB_MASK only:
+    GrB_COMP = 2,       // use the structural complement of the input
+    GrB_SCMP = 2,       // same as GrB_COMP (historical; use GrB_COMP instead)
+    GrB_STRUCTURE = 4,  // use the only pattern of the mask, not its values
+
+    // for GrB_INP0 and GrB_INP1 only:
+    GrB_TRAN = 3,       // use the transpose of the input
+
+    // for GxB_GPU_CONTROL only (DRAFT: in progress, do not use)
+    GxB_GPU_ALWAYS  = 2001,
+    GxB_GPU_NEVER   = 2002,
+
+    // for GxB_AxB_METHOD only:
+    GxB_AxB_GUSTAVSON = 1001,   // gather-scatter saxpy method
+    GxB_AxB_DOT       = 1003,   // dot product
+    GxB_AxB_HASH      = 1004,   // hash-based saxpy method
+    GxB_AxB_SAXPY     = 1005    // saxpy method (any kind)
+}
+GrB_Desc_Value ;
+
+#include "GB_opaque.h"
+#endif
+
+//------------------------------------------------------------------------------
+// subset of GB.h
+//------------------------------------------------------------------------------
+
+#include "GB_imin.h"
+#include "GB_zombie.h"
+#include "GB_nnz.h"
+#include "GB_partition.h"
+#include "GB_binary_search.h"
+#include "GB_search_for_vector_template.c"
+