Merge pull request #149 from DrTimothyAldenDavis/master

Master
DrTimothyAldenDavis · Jul 9, 2022 · 5e7c3ed · 5e7c3ed
2 parents 1a654bb + 385333b
commit 5e7c3ed
Show file tree

Hide file tree

Showing 125 changed files with 2,479 additions and 1,326 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -26,10 +26,10 @@ endif ( )
 set ( CMAKE_MACOSX_RPATH TRUE )
 
 # version of SuiteSparse:GraphBLAS
-set ( GraphBLAS_DATE "June 17, 2022" )
+set ( GraphBLAS_DATE "July 8, 2022" )
 set ( GraphBLAS_VERSION_MAJOR 7 )
 set ( GraphBLAS_VERSION_MINOR 1 )
-set ( GraphBLAS_VERSION_SUB   1 )
+set ( GraphBLAS_VERSION_SUB   2 )
 
 message ( STATUS "Building SuiteSparse:GraphBLAS version: v" ${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB}  " date: " ${GraphBLAS_DATE} )
 

diff --git a/CUDA/CMakeLists.txt b/CUDA/CMakeLists.txt
@@ -34,7 +34,24 @@ set(GRAPHBLAS_CUDA_INCLUDES
 
 message(STATUS "GraphBLAS CUDA includes: " "${GRAPHBLAS_CUDA_INCLUDES}")
 
-target_include_directories(graphblascuda PUBLIC  ${CUDAToolkit_INCLUDE_DIRS} ${GRAPHBLAS_CUDA_INCLUDES})
+set(EXTERNAL_INCLUDES_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes)
+
+IF(NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY})
+    file(MAKE_DIRECTORY ${EXTERNAL_INCLUDES_DIRECTORY})
+endif()
+
+IF(NOT EXISTS ${EXTERNAL_INCLUDES_DIRECTORY}/cuco)
+    execute_process(
+            COMMAND git clone "https://github.com/NVIDIA/cuCollections.git" --branch main --recursive cuco
+            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/external_includes)
+endif()
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/external_includes/cuco/include)
+
+target_include_directories(graphblascuda PUBLIC
+        ${CMAKE_CURRENT_BINARY_DIR}/external_includes/cuco/include
+        ${CUDAToolkit_INCLUDE_DIRS}
+        ${GRAPHBLAS_CUDA_INCLUDES})
 set_target_properties(graphblascuda PROPERTIES POSITION_INDEPENDENT_CODE ON)
 set_target_properties(graphblascuda PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
 set_target_properties(graphblascuda PROPERTIES CUDA_ARCHITECTURES "75")
@@ -46,7 +63,6 @@ install ( TARGETS graphblascuda
         PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
         ARCHIVE       DESTINATION ${CMAKE_INSTALL_LIBDIR} )
 
-
 # 1. Execute enumify/stringify/jitify logic to compile ptx kernels and compile/link w/ relevant *.cu files.
 
 # TODO: Need to do this piece in cmake
@@ -58,10 +74,10 @@ set(CUDA_TEST_SUITES
 )
 
 #
-set(CUDA_TEST_MONOIDS PLUS) #MIN MAX TIMES ANY)
-set(CUDA_TEST_BINOPS TIMES) #PLUS MIN MAX DIV MINUS RDIV RMINUS FIRST SECOND PAIR)
-set(CUDA_TEST_SEMIRINGS PLUS_TIMES) # MIN_PLUS MAX_PLUS)
-set(CUDA_TEST_DATATYPES int32_t ) #int64_t uint32_t uint64_t float double)
+set(CUDA_TEST_MONOIDS PLUS MIN MAX) # TIMES ANY)
+set(CUDA_TEST_BINOPS TIMES PLUS MIN MAX DIV) #MINUS RDIV RMINUS FIRST SECOND PAIR)
+set(CUDA_TEST_SEMIRINGS PLUS_TIMES MIN_PLUS MAX_PLUS)
+set(CUDA_TEST_DATATYPES int32_t int64_t uint32_t uint64_t float double)
 set(CUDA_TEST_KERNELS vsvs) # mp vsvs dndn spdn vssp)
 
 

diff --git a/CUDA/GB_AxB_dot3_cuda.cpp b/CUDA/GB_AxB_dot3_cuda.cpp
@@ -22,6 +22,7 @@ extern "C"
 
 #include "jitFactory.hpp"
 #include "GB_cuda_type_wrap.hpp"
+#include "test/GpuTimer.h"
 
 template<typename T, typename I>
 void print_array(void *arr, I size, const char *name) {
@@ -68,6 +69,8 @@ GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
     cudaStream_t stream;
     CHECK_CUDA_SIMPLE(cudaStreamCreate(&stream));
 
+    GpuTimer kernel_timer; 
+
     //--------------------------------------------------------------------------
     // check inputs
     //--------------------------------------------------------------------------
@@ -149,6 +152,7 @@ GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
         M_sparsity, false, M->hyper_switch, cnvec,
         cnz+1,  // add one to cnz for GB_cumsum of Cwork 
         true, C_iso, Context) ;
+
     if (info != GrB_SUCCESS)
     { 
         // out of memory
@@ -173,12 +177,39 @@ GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
             cudaMemAdviseSetPreferredLocation, device));
     }
 
+    //--------------------------------------------------------------------------
+    // Pre-fetch arrays that will be used on the device
+    //--------------------------------------------------------------------------
+
+    // prefetch M
+    CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->p, (mnvec+1) * sizeof (int64_t),
+        device, stream)) ; //stream_data) ;
+    CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->i, mnz * sizeof (int64_t),
+        device, stream )) ; //stream_data) ;
+    if (!(Mask_struct || M->iso))
+    {
+        // prefetch M->x only if the mask is valued and M is non-iso
+        CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->x, mnz * M->type->size,
+            device, stream )) ; //stream_data) ;
+    }
+
+    // prefetch C
+    CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->i, (cnz+1) * sizeof (int64_t),
+        device, stream )); //stream_data) ;
+    if (!C_iso)
+    {
+        // FIXME: why prefect C->x?
+        CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->x, (cnz+1) * C->type->size,
+            device, stream )); //stream_data) ;
+    }
+
     //--------------------------------------------------------------------------
     // copy Mp and Mh into C
     //--------------------------------------------------------------------------
 
     CHECK_CUDA_SIMPLE( cudaMemcpyAsync (C->p, M->p, (cnvec+1) * sizeof (int64_t),
         cudaMemcpyDefault, stream)) ;
+    //memcpy( C->p, M->p, (cnvec+1)* sizeof( int64_t) );
     if (M_is_hyper)
     { 
         // FIXME: this method does not yet handle the hypersparse case
@@ -207,7 +238,7 @@ GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
     jit::GBJitCache filecache = jit::GBJitCache::Instance() ;
     filecache.getFile (my_mxm_spec) ;
 
-    GBURBLE ("(GPU stringified) ") ;
+    GBURBLE ("(GPU stringified srcode = %lu)\n", my_mxm_spec.sr_code) ;
 
     //--------------------------------------------------------------------------
     // construct the tasks for phase1 and phase2
@@ -234,8 +265,8 @@ GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
     Blockbucket = (int64_t*)
         rmm_wrap_malloc(blockbuckets_size * sizeof (int64_t));
     Bucketp = (int64_t*)rmm_wrap_malloc((NBUCKETS+1) * sizeof (int64_t));
-    Bucket = (int64_t*)rmm_wrap_malloc(mnz * sizeof (int64_t));
     offset = (int64_t*)rmm_wrap_malloc(NBUCKETS * sizeof (int64_t));
+    Bucket = (int64_t*)rmm_wrap_malloc(mnz * sizeof (int64_t));
     if (Nanobuckets == NULL || Blockbucket == NULL || Bucketp == NULL
         || Bucket == NULL || offset == NULL)
     {
@@ -246,16 +277,16 @@ GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
 
     // fixme: do async with streams
     // FIXME: do we need any of these?
-    CHECK_CUDA_SIMPLE(cudaMemsetAsync(Nanobuckets, 0,
-        nanobuckets_size * sizeof(int64_t), stream));
-    CHECK_CUDA_SIMPLE(cudaMemsetAsync(Blockbucket, 0,
-        blockbuckets_size * sizeof(int64_t), stream));
+//  CHECK_CUDA_SIMPLE(cudaMemsetAsync(Nanobuckets, 0,
+//      nanobuckets_size * sizeof(int64_t), stream));
+//  CHECK_CUDA_SIMPLE(cudaMemsetAsync(Blockbucket, 0,
+//      blockbuckets_size * sizeof(int64_t), stream));
     CHECK_CUDA_SIMPLE(cudaMemsetAsync(Bucketp, 0,
         (NBUCKETS+1) * sizeof(int64_t), stream));
-    CHECK_CUDA_SIMPLE(cudaMemsetAsync(Bucket, 0,
-        mnz * sizeof(int64_t), stream));
     CHECK_CUDA_SIMPLE(cudaMemsetAsync(offset, 0,
         NBUCKETS * sizeof(int64_t), stream));
+  //CHECK_CUDA_SIMPLE(cudaMemsetAsync(Bucket, 0,
+  //    mnz * sizeof(int64_t), stream));
 
     //--------------------------------------------------------------------------
     // phase1 and phase2: place each C(i,j) in a bucket
@@ -287,15 +318,19 @@ GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
             device, stream )) ; //stream_data) ;
     }
 
-    // prefetch C
-    CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->i, (cnz+1) * sizeof (int64_t),
-        device, stream )); //stream_data) ;
-    if (!C_iso)
-    {
-        // FIXME: why prefect C->x?
-        CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->x, (cnz+1) * C->type->size,
-            device, stream )); //stream_data) ;
-    }
+//  // prefetch C
+//  CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->i, (cnz+1) * sizeof (int64_t),
+//      device, stream )); //stream_data) ;
+//  if (!C_iso)
+//  {
+//      // FIXME: why prefect C->x?
+//      CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->x, (cnz+1) * C->type->size,
+//          device, stream )); //stream_data) ;
+//  }
+
+    //--------------------------------------------------------------------------
+    // Pre-fetch arrays that will be used on the device
+    //--------------------------------------------------------------------------
 
     // prefetch A
     CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->p, (anvec+1) * sizeof (int64_t),
@@ -327,56 +362,71 @@ GrB_Info GB_AxB_dot3_cuda           // C<M> = A'*B using dot product method
     //--------------------------------------------------------------------------
 
     GBURBLE ("(GPU phase1 start) ") ;
-
+    kernel_timer.Start();
     p1lf.jitGridBlockLaunch(Nanobuckets, Blockbucket, C, M, A, B, stream);
+    CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream));
+    kernel_timer.Stop();
 
-    GBURBLE ("(GPU phase1 done) ") ;
+    GBURBLE ("(GPU phase1 done %12.6g ms )\n", kernel_timer.Elapsed()) ;
 
     //--------------------------------------------------------------------------
     // phase2: cumsum across the blockbuckets, propagate to thread level
     //--------------------------------------------------------------------------
 
-    GBURBLE ("(GPU phase1 start) ") ;
+    GBURBLE ("(GPU phase2 start nblk=%d ) ", ntasks) ;
 
+    kernel_timer.Start();
     p2lf.jitGridBlockLaunch(Blockbucket, offset, M, stream);
+    kernel_timer.Stop();
 
     CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream));
 
     int64_t s= offset[0];
     C->nzombies = s;
+    bool all_in_one = false;
     for ( int bucket = 1 ; bucket < NBUCKETS+1; ++bucket)
     {
         Bucketp[bucket] = s; 
         s+= offset[bucket];
+        if ( (Bucketp[bucket] - Bucketp[bucket-1] ) == mnz ) all_in_one = true;
     }
 
-    GBURBLE ("(GPU phase2 done) ") ;
+    GBURBLE ("(GPU phase2 done %12.6g ms )\n", kernel_timer.Elapsed()) ;
 
-    GBURBLE ("(GPU phase2end start) ") ;
+    if( !all_in_one) 
+    {
+        GBURBLE ("(GPU phase2end start nblk=%d) ",  ntasks) ;
 
-    p2elf.jitGridBlockLaunch(Nanobuckets, Blockbucket,
-                             Bucketp, Bucket, offset, C, M, stream);
+        kernel_timer.Start();
+        p2elf.jitGridBlockLaunch(Nanobuckets, Blockbucket,
+                                 Bucketp, Bucket, offset, C, M, stream);
 
-    GBURBLE ("(GPU phase2end done) ") ;
+        CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream));
+        kernel_timer.Stop();
+        GBURBLE ("(GPU phase2end done %12.6g ms)\n",kernel_timer.Elapsed()) ;
+    }
 
     //--------------------------------------------------------------------------
     // phase3: do the numerical work
     //--------------------------------------------------------------------------
 
-    CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream));
 
     for ( int bucket = 1 ; bucket < NBUCKETS; ++bucket)
     {
         int64_t start = Bucketp[bucket];
         int64_t end   = Bucketp[bucket + 1 ];
+      //int64_t start = 0;
+      //int64_t end   = cnz;
 
         if(end - start > 0) {
             // TODO: Use stream pool
             phase3launchFactory p3lf(my_mxm_spec, (GB_bucket_code)bucket);
-            p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket,
-                C, M, A, B, stream);
-            GBURBLE ("(GPU phase3 bucket %d done ) ", bucket) ;
-        }
+            GBURBLE ("(GPU phase3 bucket %d launch ) ", bucket) ;
+            kernel_timer.Start();
+            p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket, C, M, A, B, stream);
+            CHECK_CUDA_SIMPLE(cudaStreamSynchronize(stream));  // only for timing
+            kernel_timer.Stop();
+            GBURBLE ("(GPU phase3 bucket %d done %12.6g ms)\n", bucket, kernel_timer.Elapsed()) ; }
     }
 
     GB_FREE_WORKSPACE ;

diff --git a/CUDA/GB_cuda_atomics.cuh b/CUDA/GB_cuda_atomics.cuh
@@ -45,7 +45,9 @@ template <typename T>
 __device__ void atomic_sub(T* ptr, T val);
 
 template<> __device__ __inline__ void atomic_add<int>(int* ptr, int val) { atomicAdd(ptr, val); }
+template<> __device__ __inline__ void atomic_add<uint32_t>(uint32_t* ptr, uint32_t val) { atomicAdd((unsigned int*)ptr, (unsigned int)val); }
 template<> __device__ __inline__ void atomic_add<int64_t>(int64_t* ptr, int64_t val) { atomicAdd((unsigned long long*)ptr, (unsigned long long)val); }
+template<> __device__ __inline__ void atomic_add<uint64_t>(uint64_t* ptr, uint64_t val) { atomicAdd((unsigned long long*)ptr, (unsigned long long)val); }
 template<> __device__ __inline__ void atomic_add<float>(float* ptr, float val) { atomicAdd(ptr, val); }
 template<> __device__ __inline__ void atomic_add<double>(double* ptr, double val) { atomicAdd(ptr, val); }
 

diff --git a/CUDA/GB_cuda_kernel.h b/CUDA/GB_cuda_kernel.h
@@ -93,10 +93,11 @@
     #define GB_C_MULT( c, a, b)
     #define GB_MULTADD( c, a ,b )
     #define GB_DOT_TERMINAL ( c )   
-    #define GB_DOT_MERGE                                                \
+    #define GB_DOT_MERGE(pA,pB)                                         \
     {                                                                   \
         cij_exists = true ;                                             \
     }
+    #define GB_CIJ_EXIST_POSTCHECK
 
 #else
 
@@ -105,33 +106,37 @@
     #define GB_MULTADD( c, a ,b ) GB_ADD_F( (c), GB_MULT( (a),(b) ) )
     #define GB_DOT_TERMINAL ( c )
     //# if ( c == TERMINAL_VALUE) break;
-    // cij += A(k,i) * B(k,j), for merge operation
 
-    #define GB_DOT_MERGE                                                \
-    {                                                                   \
-        GB_GETA (aki, Ax, pA) ;         /* aki = A(k,i) */              \
-        GB_GETB (bkj, Bx, pB) ;         /* bkj = B(k,j) */              \
-        cij_exists = true ;                                             \
-        GB_MULTADD (cij, aki, bkj) ;    /* cij += aki * bkj */          \
-    }
+    #if GB_IS_PLUS_PAIR_REAL_SEMIRING
 
-#if 0
-    #define GB_DOT_MERGE                                                \
-    {                                                                   \
-        GB_GETA ( aki, Ax, pA) ;       /* aki = A(k,i) */               \
-        GB_GETB ( bkj, Bx, pB) ;       /* bkj = B(k,j) */               \
-        if (cij_exists)                                                 \
-        {                                                               \
-            GB_MULTADD (cij, aki, bkj) ;    /* cij += aki * bkj */      \
-        }                                                               \
-        else                                                            \
-        {                                                               \
-            /* cij = A(k,i) * B(k,j), and add to the pattern    */      \
-            cij_exists = true ;                                         \
-            GB_C_MULT (cij, aki, bkj) ;     /* cij  = aki * bkj */      \
-        }                                                               \
-    }
-#endif
+        // cij += A(k,i) * B(k,j), for merge operation (plus_pair_real semiring)
+        #if GB_ZTYPE_IGNORE_OVERFLOW
+            // plus_pair for int64, uint64, float, or double
+            #define GB_DOT_MERGE(pA,pB) cij++ ;
+            #define GB_CIJ_EXIST_POSTCHECK cij_exists = (cij != 0) ;
+        #else
+            // plus_pair semiring for small integers
+            #define GB_DOT_MERGE(pA,pB)                                     \
+            {                                                               \
+                cij_exists = true ;                                         \
+                cij++ ;                                                     \
+            }
+            #define GB_CIJ_EXIST_POSTCHECK
+        #endif
+
+    #else
+
+        // cij += A(k,i) * B(k,j), for merge operation (general case)
+        #define GB_DOT_MERGE(pA,pB)                                         \
+        {                                                                   \
+            GB_GETA (aki, Ax, pA) ;         /* aki = A(k,i) */              \
+            GB_GETB (bkj, Bx, pB) ;         /* bkj = B(k,j) */              \
+            cij_exists = true ;                                             \
+            GB_MULTADD (cij, aki, bkj) ;    /* cij += aki * bkj */          \
+        }
+        #define GB_CIJ_EXIST_POSTCHECK
+
+    #endif
 
 #endif