Merge pull request #28 from TheCodeinator/20-use-chrono-for-microbenchmarks

WIP:20 use chrono for microbenchmarks
TheCodeinator authored Sep 30, 2023
2 parents 07e964d + 9236e91 commit 5461bd1
Showing 45 changed files with 22,566 additions and 125 deletions.
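The change replaces device-clock-based timing with host-side std::chrono measurements taken around kernel launch and stream synchronization. A minimal sketch of that pattern, with a hypothetical dummy_kernel standing in for the benchmarked code (not code from this commit):

#include <chrono>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummy_kernel() { /* work under test */ }

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // steady_clock is monotonic and independent of the GPU clock frequency
    auto start = std::chrono::steady_clock::now();
    dummy_kernel<<<1, 1, 0, stream>>>();
    cudaStreamSynchronize(stream);  // make sure kernel completion is inside the measurement
    auto stop = std::chrono::steady_clock::now();

    auto us = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
    std::printf("elapsed: %lld us\n", static_cast<long long>(us));

    cudaStreamDestroy(stream);
    return 0;
}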
20 changes: 14 additions & 6 deletions benchmarks/01_put_coalescing/nvshmem_put_coalescing.cu
@@ -58,7 +58,7 @@ __device__ void send_multi_thread_sep(uint8_t *const data,
const uint64_t n_elems,
const uint64_t n_iterations) {
const uint32_t thread_global_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const uint32_t thread_stride = blockDim.x * gridDim.y;
+    const uint32_t thread_stride = blockDim.x * gridDim.x;

// start for loop together
__syncthreads();
@@ -104,7 +104,7 @@ __global__ void exchange_data(int this_pe,
TestCase test_case) {
const int other_pe = static_cast<int>(!this_pe); // there are two PEs in total
const uint32_t thread_global_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const uint32_t thread_stride = blockDim.x * gridDim.y;
+    const uint32_t thread_stride = blockDim.x * gridDim.x;

// PE 0 is the sender
if (this_pe == 0) {
@@ -122,7 +122,7 @@ } else { // PE 1 is the receiver
} else { // PE 1 is the receiver
// receiver does not do anything but waiting, only needs one thread in all scenarios
if (thread_global_id == 0) {
-        recv(data, other_pe, n_elems, n_iterations);
+        recv(data, other_pe, n_elems, n_iterations);
}
}
}
@@ -152,8 +152,16 @@ int main(int argc, char *argv[]) {
cudaSetDevice(this_pe);
cudaStreamCreate(&stream);

-    // this test is supposed to be executed on 2 PEs, each sends and receives data from the other PE
-    assert(n_pes == 2);
+    if (n_pes != 2) {
+        throw std::logic_error(
+                "this test is supposed to be executed on 2 PEs, each sends and receives data from the other PE.");
+    }
+
+    if (n_elems / (grid_dim * block_dim) < 1 || n_elems % (grid_dim * block_dim) != 0) {
+        throw std::logic_error(
+                "Make sure the number of elements is a multiple of the total number of threads"
+        );
+    }

// allocate symmetric device memory for sending/receiving the data
auto *const data = static_cast<uint8_t *>(nvshmem_malloc(n_elems));
@@ -171,7 +179,7 @@ int main(int argc, char *argv[]) {
nvshmem_free(data);

if (this_pe == 0) {
-        for (const auto &meas : measurements) {
+        for (const auto &meas: measurements) {
std::cout << "," << gb_per_sec(meas, n_iterations * n_elems);
}
std::cout << std::endl;
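For context on the stride fix above: in a one-dimensional launch the grid contains blockDim.x * gridDim.x threads, while gridDim.y is typically 1, so the old stride collapsed to a single block's width. A grid-stride loop usually looks like this (illustrative sketch, not code from this repository):

__global__ void touch_all(uint8_t *data, uint64_t n_elems) {
    const uint32_t thread_global_id = blockIdx.x * blockDim.x + threadIdx.x;
    const uint32_t thread_stride = blockDim.x * gridDim.x;  // total number of threads in a 1-D grid
    for (uint64_t i = thread_global_id; i < n_elems; i += thread_stride) {
        data[i] = static_cast<uint8_t>(i);  // each thread handles every thread_stride-th element
    }
}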
7 changes: 6 additions & 1 deletion benchmarks/03_packet_size_put_nbi/packet_size.cu
@@ -103,11 +103,16 @@ int main(int argc, char *argv[]) {
cudaSetDevice(this_pe);
cudaStreamCreate(&stream);

-    // this test is supposed to be executed on 2 PEs, each sends and receives data from the other PE
if (n_pes != 2) {
throw std::logic_error("This test has to be started with exactly 2 PEs.");
}

+    if(n_elems / (block_dim * grid_dim * MAX_SEND_SIZE) < 1 || n_elems % (block_dim * grid_dim * MAX_SEND_SIZE) != 0) {
+        throw std::logic_error(
+                "Make sure that the number of elements is a multiple of the product of the total number of threads and the maximum msg size (" +
+                std::to_string(MAX_SEND_SIZE) + ").");
+    }

// allocate symmetric device memory for sending/receiving the data
auto *const data = static_cast<uint8_t *>(nvshmem_malloc(MAX_SEND_SIZE * block_dim * grid_dim));

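The new size check above requires the element count to tile evenly across all threads times the maximum message size. A small illustration with arbitrary values (the real block_dim, grid_dim, and MAX_SEND_SIZE come from the benchmark's arguments and headers, not from here):

// Arbitrary example values, not the benchmark's defaults.
constexpr uint64_t block_dim = 128, grid_dim = 1, MAX_SEND_SIZE = 4096;
constexpr uint64_t n_elems = 2 * block_dim * grid_dim * MAX_SEND_SIZE;  // 1048576
static_assert(n_elems % (block_dim * grid_dim * MAX_SEND_SIZE) == 0,
              "n_elems tiles evenly across threads and messages, so the check would pass");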
13 changes: 9 additions & 4 deletions benchmarks/04_packet_size_fcollect/packet_size.cu
@@ -30,8 +30,6 @@ __global__ void exchange_data(uint8_t *const data_src,
nvshmem_barrier_all();
}

-// TODO: use host to measure time since GPU clock frq. can change dynamically and is therefore not reliable

/**
* cmd arguments:
* 0) program name (implicit)
@@ -53,8 +51,15 @@ int main(int argc, char *argv[]) {
cudaSetDevice(this_pe);
cudaStreamCreate(&stream);

-    // this test is supposed to be executed on 2 PEs, each sends and receives data from the other PE
-    assert(n_pes == 2);
+    if (n_pes != 2) {
+        throw std::logic_error(
+                "this test is supposed to be executed on 2 PEs, each sends and receives data from the other PE.");
+    }
+
+    if (n_bytes / MAX_SEND_SIZE < 1 || n_bytes % MAX_SEND_SIZE != 0) {
+        throw std::logic_error("Make sure that the number of bytes to send is divisible by the maximum send size (" +
+                               std::to_string(MAX_SEND_SIZE) + ").");
+    }

// allocate symmetric device memory for sending/receiving the data
auto *const data_src = static_cast<uint8_t *>(nvshmem_malloc(MAX_SEND_SIZE));
10 changes: 10 additions & 0 deletions benchmarks/05_1_single_multi_launch_simple/CMakeLists.txt
@@ -0,0 +1,10 @@
project(bench_05_single_multi_launch_simple LANGUAGES CXX CUDA)

add_executable(bench_05_single_multi_launch_simple single_multi_launch_simple.cu)

add_dependencies(bench_05_single_multi_launch_simple nvshmem nvshmem-db)
target_link_libraries(bench_05_single_multi_launch_simple nvshmem nvshmem-db)

set_property(TARGET bench_05_single_multi_launch_simple PROPERTY POSITION_INDEPENDENT_CODE ON)

set_target_properties(bench_05_single_multi_launch_simple PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
13 changes: 13 additions & 0 deletions benchmarks/05_1_single_multi_launch_simple/run.sh
@@ -0,0 +1,13 @@
#!/bin/bash

launches=(2 4 8 16 32 64 128)

output_file="results.csv"
rm -f $output_file
touch $output_file
echo "type,launches,time_single,time_multi" > $output_file

for l in "${launches[@]}"; do
echo "Running for $l launches"
./bench_05_single_multi_launch_simple "$l" >> $output_file
done
122 changes: 122 additions & 0 deletions benchmarks/05_1_single_multi_launch_simple/single_multi_launch_simple.cu
@@ -0,0 +1,122 @@
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <chrono>
#include <vector>
#include <string>
#include <assert.h>
#include "Macros.cuh"

__constant__ uint32_t work_size = 1000;


enum class OccupancyMode {
SLEEP = 0,
LOOP = 1
};

/*
Short running
*/
template<OccupancyMode occupancy>
__global__ void calculate(size_t num_launches, int* res, double* approx) {

if constexpr (occupancy == OccupancyMode::SLEEP) {
//c.f. calculate_long
__nanosleep(1000000000U);
*res += 1;
}
else if constexpr (occupancy == OccupancyMode::LOOP){
// Approximate pi/4 https://en.wikipedia.org/wiki/Leibniz_formula_for_π
for(uint32_t i {0}; i<work_size*num_launches; i++){
*approx += pow((-1),i)/(2*i+1);
}
*res += 1;
}

}

/*
Long running kernel over whole domain
*/
template<OccupancyMode occupancy>
__global__ void calculate_parts(size_t num_launches, int* res, double* approx) {

if constexpr (occupancy == OccupancyMode::SLEEP) {
// Compute capability >= 7.0 (V100)
__nanosleep(100 / num_launches);
*res += 1;
}
else if constexpr (occupancy == OccupancyMode::LOOP){
// Approximate pi/4 https://en.wikipedia.org/wiki/Leibniz_formula_for_π
for(uint32_t i {0}; i<work_size; i++){
*approx += pow((-1),i)/(2*i+1);
}
*res += 1;
}
}

// args:
// 1: num_launches
int main(int argc, char *argv[]) {

assert(argc == 2);
const uint32_t num_launches = std::stoull(argv[1]);

CUDA_CHECK(cudaSetDevice(0));
cudaStream_t stream1;
CUDA_CHECK(cudaStreamCreate(&stream1));

int* res;
double* approx;
CUDA_CHECK(cudaMalloc((void**)&res, sizeof(int)));
CUDA_CHECK(cudaMalloc((void**)&approx, sizeof(double)));
CUDA_CHECK(cudaMemset(res, 0, sizeof(int)));
CUDA_CHECK(cudaMemset(approx, 0.0, sizeof(double)));

// Warm up CUDA context
calculate<OccupancyMode::SLEEP><<<1,1,0,stream1>>>(num_launches,res, approx);
cudaStreamSynchronize(stream1);

CUDA_CHECK(cudaMemset(res, 0, sizeof(int)));
CUDA_CHECK(cudaMemset(approx, 0.0, sizeof(double)));

auto start = std::chrono::steady_clock::now();

calculate<OccupancyMode::LOOP><<<1,1,0,stream1>>>(num_launches,res, approx);
cudaStreamSynchronize(stream1);

auto stop = std::chrono::steady_clock::now();

int* host_res = reinterpret_cast<int*>(malloc(sizeof(int)));
double* host_approx = reinterpret_cast<double*>(malloc(sizeof(double)));
CUDA_CHECK(cudaMemcpy(host_res, res, sizeof(int), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(host_approx, approx, sizeof(double), cudaMemcpyDeviceToHost));
assert(*host_res == 1);

auto dur = stop-start;
auto t_ms = dur.count() * 1e-6;

CUDA_CHECK(cudaMemset(res, 0, sizeof(int)));
CUDA_CHECK(cudaMemset(approx, 0.0, sizeof(double)));

auto start2 = std::chrono::steady_clock::now();

for(int i{0}; i<num_launches;i++) {
calculate_parts<OccupancyMode::LOOP><<<1, 1, 0, stream1>>>(num_launches, res, approx);
cudaStreamSynchronize(stream1);
}

auto stop2 = std::chrono::steady_clock::now();

CUDA_CHECK(cudaMemcpy(host_res, res, sizeof(int), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(host_approx, approx, sizeof(double), cudaMemcpyDeviceToHost));
assert(*host_res == num_launches);

auto dur2 = stop2 - start2;
auto t_ms2 = dur2.count() * 1e-6;

std::cout << "05_single_multi_launch_simple" << "," << num_launches << "," << t_ms << "," << t_ms2 << std::endl;

return EXIT_SUCCESS;
}
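The two timed sections in this file repeat the same launch-and-synchronize pattern and convert nanoseconds to milliseconds via a hard-coded 1e-6 factor. A possible helper (a sketch, not part of the commit) that makes the unit conversion explicit:

#include <chrono>

// Times an arbitrary callable (e.g. a lambda that launches kernels and then
// synchronizes the stream) and returns the elapsed time in milliseconds.
template <typename F>
double time_ms(F &&launch_and_sync) {
    const auto start = std::chrono::steady_clock::now();
    launch_and_sync();
    const auto stop = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(stop - start).count();
}

// Usage sketch:
//   double t_ms = time_ms([&] {
//       calculate<OccupancyMode::LOOP><<<1, 1, 0, stream1>>>(num_launches, res, approx);
//       cudaStreamSynchronize(stream1);
//   });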
4 changes: 4 additions & 0 deletions benchmarks/05_single_multi_launch/CMakeLists.txt
@@ -1,6 +1,10 @@
project(bench_05_single_multi_launch LANGUAGES CXX CUDA)

add_executable(bench_05_single_multi_launch single_multi_launch.cu)

add_dependencies(bench_05_single_multi_launch nvshmem nvshmem-db rdmapp)
target_link_libraries(bench_05_single_multi_launch nvshmem nvshmem-db rdmapp)

set_property(TARGET bench_05_single_multi_launch PROPERTY POSITION_INDEPENDENT_CODE ON)

set_target_properties(bench_05_single_multi_launch PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
12 changes: 9 additions & 3 deletions benchmarks/05_single_multi_launch/run.sh
file mode changed 100644 → 100755
@@ -1,13 +1,19 @@
#!/bin/bash

-input_size = (1000,10000,100000,1000000)
+# disable communication over NVLINK or PCI
+export PATH=$PATH:/opt/hydra/bin
+export NVSHMEM_DISABLE_P2P=true
+
+#input_size=(1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304)
+input_size=(1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728)

output_file="results.csv"
rm -f $output_file
touch $output_file
echo "type,num_bytes,launches,time_nvshmem,time_rdma" > $output_file
echo "type,num_bytes,num_bytes_buffer,launches,time_nvshmem,time_rdma" > $output_file

for size in "${input_size[@]}"; do
echo "Running for input size $size"
-    nvshmrun -np 2 ./bench_05_single_multi_launch $size 172.18.94.10 172.18.94.11 > $output_file
+    # for each node x ip for ib y is 172.18.94.xy
+    nvshmrun -n 2 -ppn 1 --hosts 10.0.2.11,10.0.2.12 ./bench_05_single_multi_launch "$size" 172.18.94.10 172.18.94.20 > $output_file
done