Merge pull request #28 from TheCodeinator/20-use-chrono-for-microbenchmarks

WIP:20 use chrono for microbenchmarks
Showing 45 changed files with 22,566 additions and 125 deletions.
@@ -0,0 +1,10 @@
project(bench_05_single_multi_launch_simple LANGUAGES CXX CUDA)

add_executable(bench_05_single_multi_launch_simple single_multi_launch_simple.cu)

add_dependencies(bench_05_single_multi_launch_simple nvshmem nvshmem-db)
target_link_libraries(bench_05_single_multi_launch_simple nvshmem nvshmem-db)

set_property(TARGET bench_05_single_multi_launch_simple PROPERTY POSITION_INDEPENDENT_CODE ON)

set_target_properties(bench_05_single_multi_launch_simple PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
@@ -0,0 +1,13 @@
#!/bin/bash

launches=(2 4 8 16 32 64 128)

output_file="results.csv"
rm -f $output_file
touch $output_file
echo "type,launches,time_single,time_multi" > $output_file

for l in "${launches[@]}"; do
    echo "Running for $l launches"
    ./bench_05_single_multi_launch_simple "$l" >> $output_file
done
benchmarks/05_1_single_multi_launch_simple/single_multi_launch_simple.cu (122 additions, 0 deletions)
@@ -0,0 +1,122 @@
#include <cuda_runtime.h>
#include <iostream>
#include <fstream>
#include <chrono>
#include <vector>
#include <string>
#include <assert.h>
#include "Macros.cuh"

__constant__ uint32_t work_size = 1000;


enum class OccupancyMode {
    SLEEP = 0,
    LOOP = 1
};

/*
Long running kernel over the whole domain
*/
template<OccupancyMode occupancy>
__global__ void calculate(size_t num_launches, int* res, double* approx) {

    if constexpr (occupancy == OccupancyMode::SLEEP) {
        //c.f. calculate_long
        __nanosleep(1000000000U);
        *res += 1;
    }
    else if constexpr (occupancy == OccupancyMode::LOOP){
        // Approximate pi/4 https://en.wikipedia.org/wiki/Leibniz_formula_for_π
        for(uint32_t i {0}; i<work_size*num_launches; i++){
            *approx += pow((-1),i)/(2*i+1);
        }
        *res += 1;
    }

}

/*
Short running kernel over a part of the domain
*/
template<OccupancyMode occupancy>
__global__ void calculate_parts(size_t num_launches, int* res, double* approx) {

    if constexpr (occupancy == OccupancyMode::SLEEP) {
        // Compute capability >= 7.0 (V100)
        __nanosleep(100 / num_launches);
        *res += 1;
    }
    else if constexpr (occupancy == OccupancyMode::LOOP){
        // Approximate pi/4 https://en.wikipedia.org/wiki/Leibniz_formula_for_π
        for(uint32_t i {0}; i<work_size; i++){
            *approx += pow((-1),i)/(2*i+1);
        }
        *res += 1;
    }
}

// args:
// 1: num_launches
int main(int argc, char *argv[]) {

    assert(argc == 2);
    const uint32_t num_launches = std::stoull(argv[1]);

    CUDA_CHECK(cudaSetDevice(0));
    cudaStream_t stream1;
    CUDA_CHECK(cudaStreamCreate(&stream1));

    int* res;
    double* approx;
    CUDA_CHECK(cudaMalloc((void**)&res, sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&approx, sizeof(double)));
    CUDA_CHECK(cudaMemset(res, 0, sizeof(int)));
    CUDA_CHECK(cudaMemset(approx, 0.0, sizeof(double)));

    // Warm up CUDA context
    calculate<OccupancyMode::SLEEP><<<1,1,0,stream1>>>(num_launches, res, approx);
    cudaStreamSynchronize(stream1);

    CUDA_CHECK(cudaMemset(res, 0, sizeof(int)));
    CUDA_CHECK(cudaMemset(approx, 0.0, sizeof(double)));

    auto start = std::chrono::steady_clock::now();

    calculate<OccupancyMode::LOOP><<<1,1,0,stream1>>>(num_launches, res, approx);
    cudaStreamSynchronize(stream1);

    auto stop = std::chrono::steady_clock::now();

    int* host_res = reinterpret_cast<int*>(malloc(sizeof(int)));
    double* host_approx = reinterpret_cast<double*>(malloc(sizeof(double)));
    CUDA_CHECK(cudaMemcpy(host_res, res, sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(host_approx, approx, sizeof(double), cudaMemcpyDeviceToHost));
    assert(*host_res == 1);

    auto dur = stop - start;
    auto t_ms = dur.count() * 1e-6;

    CUDA_CHECK(cudaMemset(res, 0, sizeof(int)));
    CUDA_CHECK(cudaMemset(approx, 0.0, sizeof(double)));

    auto start2 = std::chrono::steady_clock::now();

    for(int i{0}; i<num_launches; i++) {
        calculate_parts<OccupancyMode::LOOP><<<1, 1, 0, stream1>>>(num_launches, res, approx);
        cudaStreamSynchronize(stream1);
    }

    auto stop2 = std::chrono::steady_clock::now();

    CUDA_CHECK(cudaMemcpy(host_res, res, sizeof(int), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(host_approx, approx, sizeof(double), cudaMemcpyDeviceToHost));
    assert(*host_res == num_launches);

    auto dur2 = stop2 - start2;
    auto t_ms2 = dur2.count() * 1e-6;

    std::cout << "05_single_multi_launch_simple" << "," << num_launches << "," << t_ms << "," << t_ms2 << std::endl;

    return EXIT_SUCCESS;
}
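Note: the CUDA_CHECK macro used throughout single_multi_launch_simple.cu comes from "Macros.cuh", which is not among the hunks shown here. As a hedged illustration only, a minimal error-checking macro of this kind (an assumption about its shape, not the repository's actual definition) might look like:

// Hypothetical sketch of a CUDA_CHECK-style macro; the real definition lives in
// Macros.cuh, which is not part of this diff.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            std::fprintf(stderr, "CUDA error '%s' at %s:%d\n",             \
                         cudaGetErrorString(err_), __FILE__, __LINE__);    \
            std::exit(EXIT_FAILURE);                                       \
        }                                                                  \
    } while (0)

If the macro behaves like this sketch, wrapping the bare cudaStreamSynchronize calls in CUDA_CHECK as well would also surface errors raised asynchronously by the timed kernels.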
@@ -1,6 +1,10 @@
project(bench_05_single_multi_launch LANGUAGES CXX CUDA)

add_executable(bench_05_single_multi_launch single_multi_launch.cu)

add_dependencies(bench_05_single_multi_launch nvshmem nvshmem-db rdmapp)
target_link_libraries(bench_05_single_multi_launch nvshmem nvshmem-db rdmapp)

set_property(TARGET bench_05_single_multi_launch PROPERTY POSITION_INDEPENDENT_CODE ON)

set_target_properties(bench_05_single_multi_launch PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON)
@@ -1,13 +1,19 @@
 #!/bin/bash
 
-input_size = (1000,10000,100000,1000000)
+# disable communication over NVLINK or PCI
+export PATH=$PATH:/opt/hydra/bin
+export NVSHMEM_DISABLE_P2P=true
+
+#input_size=(1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304)
+input_size=(1048576,2097152,4194304,8388608,16777216,33554432,67108864,134217728)
 
 output_file="results.csv"
 rm -f $output_file
 touch $output_file
-echo "type,num_bytes,launches,time_nvshmem,time_rdma" > $output_file
+echo "type,num_bytes,num_bytes_buffer,launches,time_nvshmem,time_rdma" > $output_file
 
 for size in "${input_size[@]}"; do
 echo "Running for input size $size"
-nvshmrun -np 2 ./bench_05_single_multi_launch $size 172.18.94.10 172.18.94.11 > $output_file
+# for each node x ip for ib y is 172.18.94.xy
+nvshmrun -n 2 -ppn 1 --hosts 10.0.2.11,10.0.2.12 ./bench_05_single_multi_launch "$size" 172.18.94.10 172.18.94.20 > $output_file
 done