Merge pull request #28 from TheCodeinator/20-use-chrono-for-microbenchmarks

WIP:20 use chrono for microbenchmarks
TheCodeinator authored Sep 30, 2023
2 parents 07e964d + 9236e91 commit 5461bd1
Showing 45 changed files with 22,566 additions and 125 deletions.
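The change replaces device-clock-based timing with host-side std::chrono measurements taken around kernel launch and stream synchronization. A minimal sketch of that pattern, with a hypothetical dummy_kernel standing in for the benchmarked code (not code from this commit):

#include <chrono>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummy_kernel() { /* work under test */ }

int main() {
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // steady_clock is monotonic and independent of the GPU clock frequency
    auto start = std::chrono::steady_clock::now();
    dummy_kernel<<<1, 1, 0, stream>>>();
    cudaStreamSynchronize(stream);  // make sure kernel completion is inside the measurement
    auto stop = std::chrono::steady_clock::now();

    auto us = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
    std::printf("elapsed: %lld us\n", static_cast<long long>(us));

    cudaStreamDestroy(stream);
    return 0;
}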
20 changes: 14 additions & 6 deletions benchmarks/01_put_coalescing/nvshmem_put_coalescing.cu
@@ -58,7 +58,7 @@ __device__ void send_multi_thread_sep(uint8_t *const data,
const uint64_t n_elems,
const uint64_t n_iterations) {
const uint32_t thread_global_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const uint32_t thread_stride = blockDim.x * gridDim.y;
+    const uint32_t thread_stride = blockDim.x * gridDim.x;

// start for loop together
__syncthreads();
@@ -104,7 +104,7 @@ __global__ void exchange_data(int this_pe,
TestCase test_case) {
const int other_pe = static_cast<int>(!this_pe); // there are two PEs in total
const uint32_t thread_global_id = blockIdx.x * blockDim.x + threadIdx.x;
-    const uint32_t thread_stride = blockDim.x * gridDim.y;
+    const uint32_t thread_stride = blockDim.x * gridDim.x;

// PE 0 is the sender
if (this_pe == 0) {
@@ -122,7 +122,7 @@ } else { // PE 1 is the receiver
} else { // PE 1 is the receiver
// receiver does not do anything but waiting, only needs one thread in all scenarios
if (thread_global_id == 0) {
-        recv(data, other_pe, n_elems, n_iterations);
+        recv(data, other_pe, n_elems, n_iterations);
}
}
}
@@ -152,8 +152,16 @@ int main(int argc, char *argv[]) {
cudaSetDevice(this_pe);
cudaStreamCreate(&stream);

-    // this test is supposed to be executed on 2 PEs, each sends and receives data from the other PE
-    assert(n_pes == 2);
+    if (n_pes != 2) {
+        throw std::logic_error(
+                "this test is supposed to be executed on 2 PEs, each sends and receives data from the other PE.");
+    }
+
+    if (n_elems / (grid_dim * block_dim) < 1 || n_elems % (grid_dim * block_dim) != 0) {
+        throw std::logic_error(
+                "Make sure the number of elements is a multiple of the total number of threads"
+        );
+    }

// allocate symmetric device memory for sending/receiving the data
auto *const data = static_cast<uint8_t *>(nvshmem_malloc(n_elems));
@@ -171,7 +179,7 @@ int main(int argc, char *argv[]) {
nvshmem_free(data);

if (this_pe == 0) {
-        for (const auto &meas : measurements) {
+        for (const auto &meas: measurements) {
std::cout << "," << gb_per_sec(meas, n_iterations * n_elems);
}
std::cout << std::endl;
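For context on the stride fix above: in a one-dimensional launch the grid contains blockDim.x * gridDim.x threads, while gridDim.y is typically 1, so the old stride collapsed to a single block's width. A grid-stride loop usually looks like this (illustrative sketch, not code from this repository):

__global__ void touch_all(uint8_t *data, uint64_t n_elems) {
    const uint32_t thread_global_id = blockIdx.x * blockDim.x + threadIdx.x;
    const uint32_t thread_stride = blockDim.x * gridDim.x;  // total number of threads in a 1-D grid
    for (uint64_t i = thread_global_id; i < n_elems; i += thread_stride) {
        data[i] = static_cast<uint8_t>(i);  // each thread handles every thread_stride-th element
    }
}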
7 changes: 6 additions & 1 deletion benchmarks/03_packet_size_put_nbi/packet_size.cu
@@ -103,11 +103,16 @@ int main(int argc, char *argv[]) {
cudaSetDevice(this_pe);
cudaStreamCreate(&stream);

-    // this test is supposed to be executed on 2 PEs, each sends and receives data from the other PE
if (n_pes != 2) {
throw std::logic_error("This test has to be started with exactly 2 PEs.");
}

+    if(n_elems / (block_dim * grid_dim * MAX_SEND_SIZE) < 1 || n_elems % (block_dim * grid_dim * MAX_SEND_SIZE) != 0) {
+        throw std::logic_error(
+                "Make sure that the number of elements is a multiple of the product of the total number of threads and the maximum msg size (" +
+                std::to_string(MAX_SEND_SIZE) + ").");
+    }

// allocate symmetric device memory for sending/receiving the data
auto *const data = static_cast<uint8_t *>(nvshmem_malloc(MAX_SEND_SIZE * block_dim * grid_dim));

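The new size check above requires the element count to tile evenly across all threads times the maximum message size. A small illustration with arbitrary values (the real block_dim, grid_dim, and MAX_SEND_SIZE come from the benchmark's arguments and headers, not from here):

// Arbitrary example values, not the benchmark's defaults.
constexpr uint64_t block_dim = 128, grid_dim = 1, MAX_SEND_SIZE = 4096;
constexpr uint64_t n_elems = 2 * block_dim * grid_dim * MAX_SEND_SIZE;  // 1048576
static_assert(n_elems % (block_dim * grid_dim * MAX_SEND_SIZE) == 0,
              "n_elems tiles evenly across threads and messages, so the check would pass");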
13 changes: 9 additions & 4 deletions benchmarks/04_packet_size_fcollect/packet_size.cu
@@ -30,8 +30,6 @@ __global__ void exchange_data(uint8_t *const data_src,
nvshmem_barrier_all();
}

-// TODO: use host to measure time since GPU clock frq. can change dynamically and is therefore not reliable

/**
* cmd arguments:
* 0) program name (implicit)
@@ -53,8 +51,15 @@ int main(int argc, char *argv[]) {
cudaSetDevice(this_pe);
cudaStreamCreate(&stream);

-    // this test is supposed to be executed on 2 PEs, each sends and receives data from the other PE
-    assert(n_pes == 2);
+    if (n_pes != 2) {
+        throw std::logic_error(
+                "this test is supposed to be executed on 2 PEs, each sends and receives data from the other PE.");
+    }
+
+    if (n_bytes / MAX_SEND_SIZE < 1 || n_bytes % MAX_SEND_SIZE != 0) {
+        throw std::logic_error("Make sure that the number of bytes to send is divisible by the maximum send size (" +
+                               std::to_string(MAX_SEND_SIZE) + ").");
+    }

// allocate symmetric device memory for sending/receiving the data
auto *const data_src = static_cast<uint8_t *>(nvshmem_malloc(MAX_SEND_SIZE));
10 changes: 10 additions & 0 deletions benchmarks/05_1_single_multi_launch_simple/CMakeLists.txt
@@ -0,0 +1,10 @@
project(bench_05_single_multi_launch_simple LANGUAGES CXX CUDA)

add_executable(bench_05_single_multi_launch_simple single_multi_launch_simple.cu)

add_dependencies(bench_05_single_multi_launch_simple nvshmem nvshmem-db)
target_link_libraries(bench_05_single_multi_launch_simple nvshmem nvshmem-db)

set_property(TARGET bench_05_single_multi_launch_simple PROPERTY POSITION_INDEPENDENT_CODE ON)

set_target_properties(bench_05_single_multi_launch_simple PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
13 changes: 13 additions & 0 deletions benchmarks/05_1_single_multi_launch_simple/run.sh
@@ -0,0 +1,13 @@
#!/bin/bash

launches=(2 4 8 16 32 64 128)

output_file="results.csv"
rm -f $output_file
touch $output_file
echo "type,launches,time_single,time_multi" > $output_file

for l in "${launches[@]}"; do
echo "Running for $l launches"
./bench_05_single_multi_launch_simple "$l" >> $output_file
done
122 changes: 122 additions & 0 deletions benchmarks/05_1_single_multi_launch_simple/single_multi_launch_simple.cu
@@ -0,0 +1,122 @@
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <chrono>
#include <vector>
#include <string>
#include <assert.h>
#include "Macros.cuh"

__constant__ uint32_t work_size = 1000;


enum class OccupancyMode {
SLEEP = 0,
LOOP = 1
};

/*
Short running
*/
template<OccupancyMode occupancy>
__global__ void calculate(size_t num_launches, int* res, double* approx) {

if constexpr (occupancy == OccupancyMode::SLEEP) {
//c.f. calculate_long
__nanosleep(1000000000U);
*res += 1;
}
else if constexpr (occupancy == OccupancyMode::LOOP){
// Approximate pi/4 https://en.wikipedia.org/wiki/Leibniz_formula_for_π
for(uint32_t i {0}; i<work_size*num_launches; i++){
*approx += pow((-1),i)/(2*i+1);
}
*res += 1;
}

}

/*
Long running kernel over whole domain
*/
template<OccupancyMode occupancy>
__global__ void calculate_parts(size_t num_launches, int* res, double* approx) {

if constexpr (occupancy == OccupancyMode::SLEEP) {
// Compute capability >= 7.0 (V100)
__nanosleep(100 / num_launches);
*res += 1;
}
else if constexpr (occupancy == OccupancyMode::LOOP){
// Approximate pi/4 https://en.wikipedia.org/wiki/Leibniz_formula_for_π
for(uint32_t i {0}; i<work_size; i++){
*approx += pow((-1),i)/(2*i+1);
}
*res += 1;
}
}

// args:
// 1: num_launches
int main(int argc, char *argv[]) {

assert(argc == 2);
const uint32_t num_launches = std::stoull(argv[1]);

CUDA_CHECK(cudaSetDevice(0));
cudaStream_t stream1;
CUDA_CHECK(cudaStreamCreate(&stream1));

int* res;
double* approx;
CUDA_CHECK(cudaMalloc((void**)&res, sizeof(int)));
CUDA_CHECK(cudaMalloc((void**)&approx, sizeof(double)));
CUDA_CHECK(cudaMemset(res, 0, sizeof(int)));
CUDA_CHECK(cudaMemset(approx, 0.0, sizeof(double)));

// Warm up CUDA context
calculate<OccupancyMode::SLEEP><<<1,1,0,stream1>>>(num_launches,res, approx);
cudaStreamSynchronize(stream1);

CUDA_CHECK(cudaMemset(res, 0, sizeof(int)));
CUDA_CHECK(cudaMemset(approx, 0.0, sizeof(double)));

auto start = std::chrono::steady_clock::now();

calculate<OccupancyMode::LOOP><<<1,1,0,stream1>>>(num_launches,res, approx);
cudaStreamSynchronize(stream1);

auto stop = std::chrono::steady_clock::now();

int* host_res = reinterpret_cast<int*>(malloc(sizeof(int)));
double* host_approx = reinterpret_cast<double*>(malloc(sizeof(double)));
CUDA_CHECK(cudaMemcpy(host_res, res, sizeof(int), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(host_approx, approx, sizeof(double), cudaMemcpyDeviceToHost));
assert(*host_res == 1);

auto dur = stop-start;
auto t_ms = dur.count() * 1e-6;

CUDA_CHECK(cudaMemset(res, 0, sizeof(int)));
CUDA_CHECK(cudaMemset(approx, 0.0, sizeof(double)));

auto start2 = std::chrono::steady_clock::now();

for(int i{0}; i<num_launches;i++) {
calculate_parts<OccupancyMode::LOOP><<<1, 1, 0, stream1>>>(num_launches, res, approx);
cudaStreamSynchronize(stream1);
}

auto stop2 = std::chrono::steady_clock::now();

CUDA_CHECK(cudaMemcpy(host_res, res, sizeof(int), cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaMemcpy(host_approx, approx, sizeof(double), cudaMemcpyDeviceToHost));
assert(*host_res == num_launches);

auto dur2 = stop2 - start2;
auto t_ms2 = dur2.count() * 1e-6;

std::cout << "05_single_multi_launch_simple" << "," << num_launches << "," << t_ms << "," << t_ms2 << std::endl;

return EXIT_SUCCESS;
}
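The two timed sections in this file repeat the same launch-and-synchronize pattern and convert nanoseconds to milliseconds via a hard-coded 1e-6 factor. A possible helper (a sketch, not part of the commit) that makes the unit conversion explicit:

#include <chrono>

// Times an arbitrary callable (e.g. a lambda that launches kernels and then
// synchronizes the stream) and returns the elapsed time in milliseconds.
template <typename F>
double time_ms(F &&launch_and_sync) {
    const auto start = std::chrono::steady_clock::now();
    launch_and_sync();
    const auto stop = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(stop - start).count();
}

// Usage sketch:
//   double t_ms = time_ms([&] {
//       calculate<OccupancyMode::LOOP><<<1, 1, 0, stream1>>>(num_launches, res, approx);
//       cudaStreamSynchronize(stream1);
//   });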
4 changes: 4 additions & 0 deletions benchmarks/05_single_multi_launch/CMakeLists.txt
@@ -1,6 +1,10 @@
project(bench_05_single_multi_launch LANGUAGES CXX CUDA)

add_executable(bench_05_single_multi_launch single_multi_launch.cu)

add_dependencies(bench_05_single_multi_launch nvshmem nvshmem-db rdmapp)
target_link_libraries(bench_05_single_multi_launch nvshmem nvshmem-db rdmapp)

set_property(TARGET bench_05_single_multi_launch PROPERTY POSITION_INDEPENDENT_CODE ON)

set_target_properties(bench_05_single_multi_launch PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
12 changes: 9 additions & 3 deletions benchmarks/05_single_multi_launch/run.sh
file mode changed 100644 → 100755
@@ -1,13 +1,19 @@
#!/bin/bash

-input_size = (1000,10000,100000,1000000)
+# disable communication over NVLINK or PCI
+export PATH=$PATH:/opt/hydra/bin
+export NVSHMEM_DISABLE_P2P=true
+
+#input_size=(1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304)
+input_size=(1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728)

output_file="results.csv"
rm -f $output_file
touch $output_file
echo "type,num_bytes,launches,time_nvshmem,time_rdma" > $output_file
echo "type,num_bytes,num_bytes_buffer,launches,time_nvshmem,time_rdma" > $output_file

for size in "${input_size[@]}"; do
echo "Running for input size $size"
-    nvshmrun -np 2 ./bench_05_single_multi_launch $size 172.18.94.10 172.18.94.11 > $output_file
+    # for each node x ip for ib y is 172.18.94.xy
+    nvshmrun -n 2 -ppn 1 --hosts 10.0.2.11,10.0.2.12 ./bench_05_single_multi_launch "$size" 172.18.94.10 172.18.94.20 > $output_file
done