From 0f61f64ed3a90d499abf4c311c8a1bb3625d6616 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Wed, 20 Jan 2021 15:35:01 +0100 Subject: [PATCH 1/8] add timing of cpu gpu concurrency --- src/detail/exec_timer.hpp | 14 ++++++++++++++ .../lgrngn/hook_ante_delayed_step_lgrngn.hpp | 9 +++++++++ src/solvers/lgrngn/hook_ante_step_lgrngn.hpp | 4 ++++ .../lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp | 5 +++++ src/solvers/slvr_common.hpp | 2 +- src/solvers/slvr_lgrngn.hpp | 6 +++++- 6 files changed, 38 insertions(+), 2 deletions(-) diff --git a/src/detail/exec_timer.hpp b/src/detail/exec_timer.hpp index 366d7bbc54..663ddc4c55 100644 --- a/src/detail/exec_timer.hpp +++ b/src/detail/exec_timer.hpp @@ -86,6 +86,13 @@ class exec_timer : public solver_t tend_loop = parent_t::clock::now(); tloop = std::chrono::duration_cast( tend_loop - tbeg_loop ); + // calculate CPU/GPU times and concurrency, valid only for async runs and not taking into account diagnostics in record_all + typename parent_t::timer tloop_wo_diag = tloop - (trecord_all - parent_t::tasync_wait_in_record_all), + tsync_in = parent_t::tsync, + tgpu = parent_t::tasync_wait_in_record_all + parent_t::tsync_wait + parent_t::tasync_wait + tsync_in, // time of pure GPU calculations (= wait time of CPU) + tcpugpu = tsync_in + parent_t::tasync_gpu + parent_t::tsync_gpu - tgpu, // time of concurrent CPU and GPU calculations (= total time of GPU calculations - tgpu) + tcpu = tloop_wo_diag - tgpu - tcpugpu; + std::cout << "wall time in milliseconds: " << std::endl << "loop: " << tloop.count() << std::endl << " hook_ante_step: " << thas.count() << " ("<< setup::real_t(thas.count())/tloop.count()*100 <<"%)" << std::endl @@ -102,6 +109,13 @@ class exec_timer : public solver_t << " record_all: " << trecord_all.count() << " ("<< setup::real_t(trecord_all.count())/tloop.count()*100 <<"%)" << std::endl << " async_wait in record_all: " << parent_t::tasync_wait_in_record_all.count() << " ("<< setup::real_t(parent_t::tasync_wait_in_record_all.count())/tloop.count()*100 <<"%)" << std::endl << " hook_post_step->hook_ante_step: " << thps_has.count() << " ("<< setup::real_t(thps_has.count())/tloop.count()*100 <<"%)" << std::endl; + + std::cout << std::endl + << "CPU/GPU concurrency stats, only make sense for async lgrngn runs:" << std::endl + << "tloop without diag : " << tloop_wo_diag.count() << std::endl + << " pure CPU calculations without diag: " << tcpu.count() << " ("<< setup::real_t(tcpu.count())/tloop_wo_diag.count()*100 <<"%)" << std::endl + << " pure GPU calculations without diag: " << tgpu.count() << " ("<< setup::real_t(tgpu.count())/tloop_wo_diag.count()*100 <<"%)" << std::endl + << " concurrent CPU&GPU without diag: " << tcpugpu.count() << " ("<< setup::real_t(tcpugpu.count())/tloop_wo_diag.count()*100 <<"%)" << std::endl; } } } diff --git a/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp b/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp index 6815b2d094..f9bd2af691 100644 --- a/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp +++ b/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp @@ -3,6 +3,9 @@ #if defined(STD_FUTURE_WORKS) # include #endif +#if defined(UWLCM_TIMING) +# include "../../detail/func_time.hpp" +#endif template void slvr_lgrngn::hook_ante_delayed_step() @@ -19,7 +22,11 @@ void slvr_lgrngn::hook_ante_delayed_step() #if defined(UWLCM_TIMING) tbeg = parent_t::clock::now(); #endif +#if defined(UWLCM_TIMING) + parent_t::tsync_gpu += ftr.get(); +#else ftr.get(); +#endif #if defined(UWLCM_TIMING) tend = parent_t::clock::now(); parent_t::tsync_wait += std::chrono::duration_cast( tend - tbeg ); @@ -66,6 +73,7 @@ void slvr_lgrngn::hook_ante_delayed_step() if(params.backend == CUDA) ftr = std::async( std::launch::async, + func_time, &particles_t::step_async, dynamic_cast*>(prtcls.get()), params.cloudph_opts @@ -73,6 +81,7 @@ void slvr_lgrngn::hook_ante_delayed_step() else if(params.backend == multi_CUDA) ftr = std::async( std::launch::async, + func_time, &particles_t::step_async, dynamic_cast*>(prtcls.get()), params.cloudph_opts diff --git a/src/solvers/lgrngn/hook_ante_step_lgrngn.hpp b/src/solvers/lgrngn/hook_ante_step_lgrngn.hpp index 4c2e57c177..dcde0fc268 100644 --- a/src/solvers/lgrngn/hook_ante_step_lgrngn.hpp +++ b/src/solvers/lgrngn/hook_ante_step_lgrngn.hpp @@ -20,7 +20,11 @@ void slvr_lgrngn::hook_ante_step() #if defined(UWLCM_TIMING) tbeg = parent_t::clock::now(); #endif +#if defined(UWLCM_TIMING) + parent_t::tasync_gpu += ftr.get(); +#else ftr.get(); +#endif #if defined(UWLCM_TIMING) tend = parent_t::clock::now(); parent_t::tasync_wait += std::chrono::duration_cast( tend - tbeg ); diff --git a/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp b/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp index 2591c3a932..dc36316080 100644 --- a/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp +++ b/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp @@ -3,6 +3,9 @@ #if defined(STD_FUTURE_WORKS) # include #endif +#if defined(UWLCM_TIMING) +# include "../../detail/func_time.hpp" +#endif template void slvr_lgrngn::hook_mixed_rhs_ante_step() @@ -64,6 +67,7 @@ void slvr_lgrngn::hook_mixed_rhs_ante_step() if(params.backend == CUDA) ftr = std::async( std::launch::async, + func_time, &particles_t::step_cond, dynamic_cast*>(prtcls.get()), params.cloudph_opts, @@ -74,6 +78,7 @@ void slvr_lgrngn::hook_mixed_rhs_ante_step() else if(params.backend == multi_CUDA) ftr = std::async( std::launch::async, + func_time, &particles_t::step_cond, dynamic_cast*>(prtcls.get()), params.cloudph_opts, diff --git a/src/solvers/slvr_common.hpp b/src/solvers/slvr_common.hpp index 55978fa813..5ad1566b0c 100644 --- a/src/solvers/slvr_common.hpp +++ b/src/solvers/slvr_common.hpp @@ -28,7 +28,7 @@ class slvr_common : public slvr_dim using clock = std::chrono::system_clock; using timer = std::chrono::milliseconds; - timer tsync, tsync_wait, tasync, tasync_wait, tasync_wait_in_record_all; // timings used in lgrngn solver TODO: move them to slvr_lgrngn + timer tsync, tsync_gpu, tsync_wait, tasync, tasync_gpu, tasync_wait, tasync_wait_in_record_all; // timings used in lgrngn solver TODO: move them to slvr_lgrngn protected: #endif diff --git a/src/solvers/slvr_lgrngn.hpp b/src/solvers/slvr_lgrngn.hpp index 36bb8cfcff..078bcc4b88 100644 --- a/src/solvers/slvr_lgrngn.hpp +++ b/src/solvers/slvr_lgrngn.hpp @@ -129,7 +129,7 @@ class slvr_lgrngn : public std::conditional_t ftr; + std::future ftr; #endif void record_all() @@ -143,7 +143,11 @@ class slvr_lgrngn : public std::conditional_ttimestep > 0 && params.async) { assert(ftr.valid()); +#if defined(UWLCM_TIMING) + parent_t::tasync_gpu += ftr.get(); +#else ftr.get(); +#endif } #endif #if defined(UWLCM_TIMING) From 2d93bd71050a74217a5fe84746f2d893d1a628a1 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Wed, 20 Jan 2021 15:35:01 +0100 Subject: [PATCH 2/8] add timing of cpu gpu concurrency --- src/detail/exec_timer.hpp | 14 ++++++++++++++ .../lgrngn/hook_ante_delayed_step_lgrngn.hpp | 9 +++++++++ src/solvers/lgrngn/hook_ante_step_lgrngn.hpp | 4 ++++ .../lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp | 5 +++++ src/solvers/slvr_common.hpp | 2 +- src/solvers/slvr_lgrngn.hpp | 6 +++++- 6 files changed, 38 insertions(+), 2 deletions(-) diff --git a/src/detail/exec_timer.hpp b/src/detail/exec_timer.hpp index 366d7bbc54..663ddc4c55 100644 --- a/src/detail/exec_timer.hpp +++ b/src/detail/exec_timer.hpp @@ -86,6 +86,13 @@ class exec_timer : public solver_t tend_loop = parent_t::clock::now(); tloop = std::chrono::duration_cast( tend_loop - tbeg_loop ); + // calculate CPU/GPU times and concurrency, valid only for async runs and not taking into account diagnostics in record_all + typename parent_t::timer tloop_wo_diag = tloop - (trecord_all - parent_t::tasync_wait_in_record_all), + tsync_in = parent_t::tsync, + tgpu = parent_t::tasync_wait_in_record_all + parent_t::tsync_wait + parent_t::tasync_wait + tsync_in, // time of pure GPU calculations (= wait time of CPU) + tcpugpu = tsync_in + parent_t::tasync_gpu + parent_t::tsync_gpu - tgpu, // time of concurrent CPU and GPU calculations (= total time of GPU calculations - tgpu) + tcpu = tloop_wo_diag - tgpu - tcpugpu; + std::cout << "wall time in milliseconds: " << std::endl << "loop: " << tloop.count() << std::endl << " hook_ante_step: " << thas.count() << " ("<< setup::real_t(thas.count())/tloop.count()*100 <<"%)" << std::endl @@ -102,6 +109,13 @@ class exec_timer : public solver_t << " record_all: " << trecord_all.count() << " ("<< setup::real_t(trecord_all.count())/tloop.count()*100 <<"%)" << std::endl << " async_wait in record_all: " << parent_t::tasync_wait_in_record_all.count() << " ("<< setup::real_t(parent_t::tasync_wait_in_record_all.count())/tloop.count()*100 <<"%)" << std::endl << " hook_post_step->hook_ante_step: " << thps_has.count() << " ("<< setup::real_t(thps_has.count())/tloop.count()*100 <<"%)" << std::endl; + + std::cout << std::endl + << "CPU/GPU concurrency stats, only make sense for async lgrngn runs:" << std::endl + << "tloop without diag : " << tloop_wo_diag.count() << std::endl + << " pure CPU calculations without diag: " << tcpu.count() << " ("<< setup::real_t(tcpu.count())/tloop_wo_diag.count()*100 <<"%)" << std::endl + << " pure GPU calculations without diag: " << tgpu.count() << " ("<< setup::real_t(tgpu.count())/tloop_wo_diag.count()*100 <<"%)" << std::endl + << " concurrent CPU&GPU without diag: " << tcpugpu.count() << " ("<< setup::real_t(tcpugpu.count())/tloop_wo_diag.count()*100 <<"%)" << std::endl; } } } diff --git a/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp b/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp index 6815b2d094..132f997330 100644 --- a/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp +++ b/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp @@ -3,6 +3,9 @@ #if defined(STD_FUTURE_WORKS) # include #endif +#if defined(UWLCM_TIMING) +# include "../../detail/func_time.hpp" +#endif template void slvr_lgrngn::hook_ante_delayed_step() @@ -19,7 +22,11 @@ void slvr_lgrngn::hook_ante_delayed_step() #if defined(UWLCM_TIMING) tbeg = parent_t::clock::now(); #endif +#if defined(UWLCM_TIMING) + parent_t::tsync_gpu += ftr.get(); +#else ftr.get(); +#endif #if defined(UWLCM_TIMING) tend = parent_t::clock::now(); parent_t::tsync_wait += std::chrono::duration_cast( tend - tbeg ); @@ -66,6 +73,7 @@ void slvr_lgrngn::hook_ante_delayed_step() if(params.backend == CUDA) ftr = std::async( std::launch::async, + func_time, &particles_t::step_async, dynamic_cast*>(prtcls.get()), params.cloudph_opts @@ -73,6 +81,7 @@ void slvr_lgrngn::hook_ante_delayed_step() else if(params.backend == multi_CUDA) ftr = std::async( std::launch::async, + func_time, &particles_t::step_async, dynamic_cast*>(prtcls.get()), params.cloudph_opts diff --git a/src/solvers/lgrngn/hook_ante_step_lgrngn.hpp b/src/solvers/lgrngn/hook_ante_step_lgrngn.hpp index 4c2e57c177..dcde0fc268 100644 --- a/src/solvers/lgrngn/hook_ante_step_lgrngn.hpp +++ b/src/solvers/lgrngn/hook_ante_step_lgrngn.hpp @@ -20,7 +20,11 @@ void slvr_lgrngn::hook_ante_step() #if defined(UWLCM_TIMING) tbeg = parent_t::clock::now(); #endif +#if defined(UWLCM_TIMING) + parent_t::tasync_gpu += ftr.get(); +#else ftr.get(); +#endif #if defined(UWLCM_TIMING) tend = parent_t::clock::now(); parent_t::tasync_wait += std::chrono::duration_cast( tend - tbeg ); diff --git a/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp b/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp index 2591c3a932..7c2ab9febf 100644 --- a/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp +++ b/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp @@ -3,6 +3,9 @@ #if defined(STD_FUTURE_WORKS) # include #endif +#if defined(UWLCM_TIMING) +# include "../../detail/func_time.hpp" +#endif template void slvr_lgrngn::hook_mixed_rhs_ante_step() @@ -64,6 +67,7 @@ void slvr_lgrngn::hook_mixed_rhs_ante_step() if(params.backend == CUDA) ftr = std::async( std::launch::async, + func_time, &particles_t::step_cond, dynamic_cast*>(prtcls.get()), params.cloudph_opts, @@ -74,6 +78,7 @@ void slvr_lgrngn::hook_mixed_rhs_ante_step() else if(params.backend == multi_CUDA) ftr = std::async( std::launch::async, + func_time, &particles_t::step_cond, dynamic_cast*>(prtcls.get()), params.cloudph_opts, diff --git a/src/solvers/slvr_common.hpp b/src/solvers/slvr_common.hpp index 55978fa813..5ad1566b0c 100644 --- a/src/solvers/slvr_common.hpp +++ b/src/solvers/slvr_common.hpp @@ -28,7 +28,7 @@ class slvr_common : public slvr_dim using clock = std::chrono::system_clock; using timer = std::chrono::milliseconds; - timer tsync, tsync_wait, tasync, tasync_wait, tasync_wait_in_record_all; // timings used in lgrngn solver TODO: move them to slvr_lgrngn + timer tsync, tsync_gpu, tsync_wait, tasync, tasync_gpu, tasync_wait, tasync_wait_in_record_all; // timings used in lgrngn solver TODO: move them to slvr_lgrngn protected: #endif diff --git a/src/solvers/slvr_lgrngn.hpp b/src/solvers/slvr_lgrngn.hpp index 36bb8cfcff..078bcc4b88 100644 --- a/src/solvers/slvr_lgrngn.hpp +++ b/src/solvers/slvr_lgrngn.hpp @@ -129,7 +129,7 @@ class slvr_lgrngn : public std::conditional_t ftr; + std::future ftr; #endif void record_all() @@ -143,7 +143,11 @@ class slvr_lgrngn : public std::conditional_ttimestep > 0 && params.async) { assert(ftr.valid()); +#if defined(UWLCM_TIMING) + parent_t::tasync_gpu += ftr.get(); +#else ftr.get(); +#endif } #endif #if defined(UWLCM_TIMING) From 9df1b4dd8d6a2760a0218681d8b787f90ae1bc63 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Wed, 20 Jan 2021 15:52:39 +0100 Subject: [PATCH 3/8] track func time --- src/detail/func_time.hpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 src/detail/func_time.hpp diff --git a/src/detail/func_time.hpp b/src/detail/func_time.hpp new file mode 100644 index 0000000000..cb9f31f8ce --- /dev/null +++ b/src/detail/func_time.hpp @@ -0,0 +1,17 @@ +// function that calculates execution time of any other member function called via ptr +#pragma once + +#if defined(UWLCM_TIMING) + template + timer func_time(F func, ptr p, Args&&... args){ + timer t1=clock::now(); + p->func(std::forward(args)...); + return std::chrono::duration_cast(clock::now()-t1); + } +#else + template + timer func_time(F func, ptr p, Args&&... args){ + p->func(std::forward(args)...); + return timer(); + } +#endif From 5a9d51268e2b2051a97b90f08acb57b14c80ff94 Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Thu, 21 Jan 2021 12:36:00 +0100 Subject: [PATCH 4/8] async timing launcher - pass argumets by reference --- src/detail/func_time.hpp | 79 +++++++++++++++++-- .../lgrngn/hook_ante_delayed_step_lgrngn.hpp | 12 +-- .../hook_mixed_rhs_ante_step_lgrngn.hpp | 12 +-- 3 files changed, 80 insertions(+), 23 deletions(-) diff --git a/src/detail/func_time.hpp b/src/detail/func_time.hpp index cb9f31f8ce..d39675a403 100644 --- a/src/detail/func_time.hpp +++ b/src/detail/func_time.hpp @@ -1,17 +1,86 @@ // function that calculates execution time of any other member function called via ptr #pragma once +// async_forwarder code taken from https://kholdstare.github.io/technical/2012/12/18/perfect-forwarding-to-async-2.html (C) Alexander Kondratskiy +// it is used to pass any type of reference (lvalue or rvalue) to std::async +template +class async_forwarder +{ + // Store value directly + T val_; + +public: + /** + * Move an rvalue of T into the wrapper, + * incurring no copies. + */ + async_forwarder(T&& t) + : val_(std::move(t)) { } + + // ensure no copies are made + async_forwarder(async_forwarder const& other) = delete; + + // move constructor + async_forwarder(async_forwarder&& other) + : val_(std::move(other.val_)) { } + + // Move the value out. + // Note: can only occur once! + operator T&& () { return std::move(val_); } + operator T&& () const { return std::move(val_); } +}; + +// This particular specialization +// is essentially std::ref +template +class async_forwarder +{ + T& val_; + +public: + /** + * Wrap the reference when passed an lvalue reference, + * to fool std::async + */ + async_forwarder(T& t) : val_(t) { } + + // ensure no copies are made + async_forwarder(async_forwarder const& other) = delete; + + // move constructor + async_forwarder(async_forwarder&& other) + : val_(other.val_) { } + + // User-defined conversion that automatically + // converts to the appropriate type + operator T& () { return val_; } + operator T const& () const { return val_; } +}; + + #if defined(UWLCM_TIMING) template timer func_time(F func, ptr p, Args&&... args){ - timer t1=clock::now(); - p->func(std::forward(args)...); - return std::chrono::duration_cast(clock::now()-t1); + auto t1=clock::now(); + (p->*func)(std::forward(args)...); + return std::chrono::duration_cast(clock::now()-t1); } #else template timer func_time(F func, ptr p, Args&&... args){ - p->func(std::forward(args)...); - return timer(); + (p->*func)(std::forward(args)...); + return timer(); } #endif + +template +std::future async_timing_launcher(F func, ptr p, Args&&... args) // func and p are pointers, so their copies are lightweight +{ + return std::async( + std::launch::async, + func_time, + func, + p, + async_forwarder(std::forward(args))... // ATTENTION! args are passed by reference to async + ); +} diff --git a/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp b/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp index 132f997330..be1382fcec 100644 --- a/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp +++ b/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp @@ -3,9 +3,7 @@ #if defined(STD_FUTURE_WORKS) # include #endif -#if defined(UWLCM_TIMING) -# include "../../detail/func_time.hpp" -#endif +#include "../../detail/func_time.hpp" template void slvr_lgrngn::hook_ante_delayed_step() @@ -71,17 +69,13 @@ void slvr_lgrngn::hook_ante_delayed_step() { assert(!ftr.valid()); if(params.backend == CUDA) - ftr = std::async( - std::launch::async, - func_time, + ftr = async_timing_launcher( &particles_t::step_async, dynamic_cast*>(prtcls.get()), params.cloudph_opts ); else if(params.backend == multi_CUDA) - ftr = std::async( - std::launch::async, - func_time, + ftr = async_timing_launcher( &particles_t::step_async, dynamic_cast*>(prtcls.get()), params.cloudph_opts diff --git a/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp b/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp index 7c2ab9febf..562c901a44 100644 --- a/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp +++ b/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp @@ -1,11 +1,9 @@ #pragma once #include "../slvr_lgrngn.hpp" +#include "../../detail/func_time.hpp" #if defined(STD_FUTURE_WORKS) # include #endif -#if defined(UWLCM_TIMING) -# include "../../detail/func_time.hpp" -#endif template void slvr_lgrngn::hook_mixed_rhs_ante_step() @@ -65,9 +63,7 @@ void slvr_lgrngn::hook_mixed_rhs_ante_step() { assert(!ftr.valid()); if(params.backend == CUDA) - ftr = std::async( - std::launch::async, - func_time, + ftr = async_timing_launcher( &particles_t::step_cond, dynamic_cast*>(prtcls.get()), params.cloudph_opts, @@ -76,9 +72,7 @@ void slvr_lgrngn::hook_mixed_rhs_ante_step() std::map >() ); else if(params.backend == multi_CUDA) - ftr = std::async( - std::launch::async, - func_time, + ftr = async_timing_launcher( &particles_t::step_cond, dynamic_cast*>(prtcls.get()), params.cloudph_opts, From dca239a4c0e98b8ae281e334b08ccf11205b1b4e Mon Sep 17 00:00:00 2001 From: pdziekan Date: Fri, 22 Jan 2021 15:03:04 +0100 Subject: [PATCH 5/8] record all timing fixes --- src/detail/exec_timer.hpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/detail/exec_timer.hpp b/src/detail/exec_timer.hpp index 663ddc4c55..c2527e12cd 100644 --- a/src/detail/exec_timer.hpp +++ b/src/detail/exec_timer.hpp @@ -54,7 +54,10 @@ class exec_timer : public solver_t parent_t::hook_ante_loop(nt); this->mem->barrier(); if (this->rank == 0) + { tbeg_loop = parent_t::clock::now(); + trecord_all = parent_t::timer::zero(); // reset to 0, because we only want record all done in loop, not the one in ante_loop + } this->mem->barrier(); } @@ -87,11 +90,10 @@ class exec_timer : public solver_t tloop = std::chrono::duration_cast( tend_loop - tbeg_loop ); // calculate CPU/GPU times and concurrency, valid only for async runs and not taking into account diagnostics in record_all - typename parent_t::timer tloop_wo_diag = tloop - (trecord_all - parent_t::tasync_wait_in_record_all), - tsync_in = parent_t::tsync, + typename parent_t::timer tsync_in = parent_t::tsync, tgpu = parent_t::tasync_wait_in_record_all + parent_t::tsync_wait + parent_t::tasync_wait + tsync_in, // time of pure GPU calculations (= wait time of CPU) tcpugpu = tsync_in + parent_t::tasync_gpu + parent_t::tsync_gpu - tgpu, // time of concurrent CPU and GPU calculations (= total time of GPU calculations - tgpu) - tcpu = tloop_wo_diag - tgpu - tcpugpu; + tcpu = tloop - tgpu - tcpugpu; std::cout << "wall time in milliseconds: " << std::endl << "loop: " << tloop.count() << std::endl @@ -106,16 +108,18 @@ class exec_timer : public solver_t << " delayed step: " << thads_hps.count() << " ("<< setup::real_t(thads_hps.count())/tloop.count()*100 <<"%)" << std::endl << " hook_post_step: " << thps.count() << " ("<< setup::real_t(thps.count())/tloop.count()*100 <<"%)" << std::endl << " hook_mixed_rhs_post_step: " << thmps.count() << " ("<< setup::real_t(thmps.count())/tloop.count()*100 <<"%)" << std::endl - << " record_all: " << trecord_all.count() << " ("<< setup::real_t(trecord_all.count())/tloop.count()*100 <<"%)" << std::endl + << " record_all (in loop): " << trecord_all.count() << " ("<< setup::real_t(trecord_all.count())/tloop.count()*100 <<"%)" << std::endl << " async_wait in record_all: " << parent_t::tasync_wait_in_record_all.count() << " ("<< setup::real_t(parent_t::tasync_wait_in_record_all.count())/tloop.count()*100 <<"%)" << std::endl << " hook_post_step->hook_ante_step: " << thps_has.count() << " ("<< setup::real_t(thps_has.count())/tloop.count()*100 <<"%)" << std::endl; std::cout << std::endl - << "CPU/GPU concurrency stats, only make sense for async lgrngn runs:" << std::endl - << "tloop without diag : " << tloop_wo_diag.count() << std::endl - << " pure CPU calculations without diag: " << tcpu.count() << " ("<< setup::real_t(tcpu.count())/tloop_wo_diag.count()*100 <<"%)" << std::endl - << " pure GPU calculations without diag: " << tgpu.count() << " ("<< setup::real_t(tgpu.count())/tloop_wo_diag.count()*100 <<"%)" << std::endl - << " concurrent CPU&GPU without diag: " << tcpugpu.count() << " ("<< setup::real_t(tcpugpu.count())/tloop_wo_diag.count()*100 <<"%)" << std::endl; + << "CPU/GPU concurrency stats, only make sense for async lgrngn runs" << std::endl + << "and does not take into account GPU time in record_all, so most accurate without diag:" << std::endl + << " pure CPU calculations: " << tcpu.count() << " ("<< setup::real_t(tcpu.count())/tloop.count()*100 <<"%)" << std::endl + << " pure GPU calculations: " << tgpu.count() << " ("<< setup::real_t(tgpu.count())/tloop.count()*100 <<"%)" << std::endl + << " concurrent CPU&GPU: " << tcpugpu.count() << " ("<< setup::real_t(tcpugpu.count())/tloop.count()*100 <<"%)" << std::endl + << " tsync_gpu: " << parent_t::tsync_gpu.count() << " ("<< setup::real_t(parent_t::tsync_gpu.count())/tloop.count()*100 <<"%)" << std::endl + << " tasync_gpu: " << parent_t::tasync_gpu.count() << " ("<< setup::real_t(parent_t::tasync_gpu.count())/tloop.count()*100 <<"%)" << std::endl; } } } From d5722fb42e4c6be6cab8ef2a18c61687d74048d2 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Mon, 25 Jan 2021 13:11:58 +0100 Subject: [PATCH 6/8] lgrngn: temp eulerian rc field for more concurrency --- src/detail/exec_timer.hpp | 2 +- .../lgrngn/hook_ante_delayed_step_lgrngn.hpp | 13 ++++++++- src/solvers/lgrngn/hook_ante_step_lgrngn.hpp | 26 ----------------- .../hook_mixed_rhs_ante_step_lgrngn.hpp | 23 +++++++++++++++ src/solvers/slvr_lgrngn.hpp | 29 ++++++++++++------- 5 files changed, 54 insertions(+), 39 deletions(-) diff --git a/src/detail/exec_timer.hpp b/src/detail/exec_timer.hpp index c2527e12cd..1ba212fd67 100644 --- a/src/detail/exec_timer.hpp +++ b/src/detail/exec_timer.hpp @@ -98,8 +98,8 @@ class exec_timer : public solver_t std::cout << "wall time in milliseconds: " << std::endl << "loop: " << tloop.count() << std::endl << " hook_ante_step: " << thas.count() << " ("<< setup::real_t(thas.count())/tloop.count()*100 <<"%)" << std::endl - << " async_wait: " << parent_t::tasync_wait.count() << " ("<< setup::real_t(parent_t::tasync_wait.count())/tloop.count()*100 <<"%)" << std::endl << " hook_mixed_rhs_ante_step: " << thmas.count() << " ("<< setup::real_t(thmas.count())/tloop.count()*100 <<"%)" << std::endl + << " async_wait: " << parent_t::tasync_wait.count() << " ("<< setup::real_t(parent_t::tasync_wait.count())/tloop.count()*100 <<"%)" << std::endl << " sync: " << parent_t::tsync.count() << " ("<< setup::real_t(parent_t::tsync.count())/tloop.count()*100 <<"%)" << std::endl << " step: " << thas_hads.count() << " ("<< setup::real_t(thas_hads.count())/tloop.count()*100 <<"%)" << std::endl << " hook_ante_delayed_step: " << thads.count() << " ("<< setup::real_t(thads.count())/tloop.count()*100 <<"%)" << std::endl diff --git a/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp b/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp index be1382fcec..6c854057bd 100644 --- a/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp +++ b/src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp @@ -53,6 +53,8 @@ void slvr_lgrngn::hook_ante_delayed_step() // store liquid water content (post-cond, pre-adve and pre-subsidence) diag_rl(); + if(ct_params_t::sgs_scheme == libmpdataxx::solvers::smg) + diag_rc(); if (this->rank == 0) { @@ -109,9 +111,18 @@ void slvr_lgrngn::hook_ante_delayed_step() this->vert_grad_cnt(tmp1, F, params.dz); F(ijk).reindex(this->zero) *= - (*params.w_LS)(this->vert_idx); r_l(ijk) += F(ijk) * this->dt; + + tmp1(ijk) = r_c(ijk); + // fill halos for gradient calculation + // TODO: no need to xchng in horizontal, which potentially causes MPI communication + this->xchng_sclr(tmp1, this->ijk, this->halo); + this->vert_grad_cnt(tmp1, F, params.dz); + F(ijk).reindex(this->zero) *= - (*params.w_LS)(this->vert_idx); + r_c(ijk) += F(ijk) * this->dt; } - // advect r_l (1st-order) + // advect r_l and r_c (1st-order) this->self_advec_donorcell(this->r_l); + this->self_advec_donorcell(this->r_c); negcheck(this->mem->advectee(ix::rv)(this->ijk), "rv at the end of ante delayed step"); } diff --git a/src/solvers/lgrngn/hook_ante_step_lgrngn.hpp b/src/solvers/lgrngn/hook_ante_step_lgrngn.hpp index dcde0fc268..04503bffa6 100644 --- a/src/solvers/lgrngn/hook_ante_step_lgrngn.hpp +++ b/src/solvers/lgrngn/hook_ante_step_lgrngn.hpp @@ -7,32 +7,6 @@ template void slvr_lgrngn::hook_ante_step() { - if (this->rank == 0) - { - // assuring previous async step finished ... -#if defined(STD_FUTURE_WORKS) - if ( - params.async && - this->timestep != 0 && // ... but not in first timestep ... - ((this->timestep ) % this->outfreq != 0) // ... and not after diag call, note: timestep is updated after ante_step - ) { - assert(ftr.valid()); -#if defined(UWLCM_TIMING) - tbeg = parent_t::clock::now(); -#endif -#if defined(UWLCM_TIMING) - parent_t::tasync_gpu += ftr.get(); -#else - ftr.get(); -#endif -#if defined(UWLCM_TIMING) - tend = parent_t::clock::now(); - parent_t::tasync_wait += std::chrono::duration_cast( tend - tbeg ); -#endif - } else assert(!ftr.valid()); -#endif - } - this->mem->barrier(); parent_t::hook_ante_step(); // includes RHS, which in turn launches sync_in and step_cond negcheck(this->mem->advectee(ix::rv)(this->ijk), "rv after at the end of hook_ante_step"); } diff --git a/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp b/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp index 562c901a44..0e8ee667fc 100644 --- a/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp +++ b/src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp @@ -34,6 +34,29 @@ void slvr_lgrngn::hook_mixed_rhs_ante_step() Cz.reindex(this->zero) /= (*params.rhod)(this->vert_idx); // TODO: should be interpolated, since theres a shift between positions of rhod and Cz } + // assuring previous async step finished ... +#if defined(STD_FUTURE_WORKS) + if ( + params.async && + this->timestep != 0 && // ... but not in first timestep ... + ((this->timestep ) % this->outfreq != 0) // ... and not after diag call, note: timestep is updated after ante_step + ) { + assert(ftr.valid()); +#if defined(UWLCM_TIMING) + tbeg = parent_t::clock::now(); +#endif +#if defined(UWLCM_TIMING) + parent_t::tasync_gpu += ftr.get(); +#else + ftr.get(); +#endif +#if defined(UWLCM_TIMING) + tend = parent_t::clock::now(); + parent_t::tasync_wait += std::chrono::duration_cast( tend - tbeg ); +#endif + } else assert(!ftr.valid()); +#endif + // start synchronous stuff timer #if defined(UWLCM_TIMING) tbeg = parent_t::clock::now(); diff --git a/src/solvers/slvr_lgrngn.hpp b/src/solvers/slvr_lgrngn.hpp index 078bcc4b88..cab759b26e 100644 --- a/src/solvers/slvr_lgrngn.hpp +++ b/src/solvers/slvr_lgrngn.hpp @@ -37,7 +37,8 @@ class slvr_lgrngn : public std::conditional_trank == 0) { prtcls->diag_wet_rng(.5e-6, 25.e-6); prtcls->diag_wet_mom(3); - auto rc = tmp(this->domain); + auto rc = r_c(this->domain); rc = typename parent_t::arr_t(prtcls->outbuf(), rc.shape(), blitz::duplicateData); } this->mem->barrier(); - nancheck(tmp(this->ijk), "tmp after copying from diag_wet_mom(3) in get_rc"); - tmp(this->ijk) *= 4./3. * 1000. * 3.14159; // get mixing ratio [kg/kg] + nancheck(r_c(this->ijk), "r_c after copying from diag_wet_mom(3) in diag_rc"); + r_c(this->ijk) *= 4./3. * 1000. * 3.14159; // get mixing ratio [kg/kg] this->mem->barrier(); - this->avg_edge_sclr(tmp, this->ijk); // in case of cyclic bcond, rc on edges needs to be the same - - this->mem->barrier(); - return tmp; + this->avg_edge_sclr(r_c, this->ijk); // in case of cyclic bcond, rc on edges needs to be the same + } + + virtual typename parent_t::arr_t get_rc(typename parent_t::arr_t& tmp) final + { + return r_c; } void hook_ante_loop(int nt); @@ -126,6 +129,8 @@ class slvr_lgrngn : public std::conditional_ttmp[__FILE__][0][0]), rv_post_cond(args.mem->tmp[__FILE__][0][1]), th_pre_cond(args.mem->tmp[__FILE__][0][2]), - th_post_cond(args.mem->tmp[__FILE__][0][3]) + th_post_cond(args.mem->tmp[__FILE__][0][3]), + r_c(args.mem->tmp[__FILE__][1][0]) { - + r_c = 0.; // TODO: equip rank() in libmpdata with an assert() checking if not in serial block } @@ -199,6 +205,7 @@ class slvr_lgrngn : public std::conditional_t Date: Wed, 14 Apr 2021 11:07:57 +0200 Subject: [PATCH 7/8] fix timing of hps->has --- src/detail/exec_timer.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/detail/exec_timer.hpp b/src/detail/exec_timer.hpp index 1ba212fd67..15d9dc8a54 100644 --- a/src/detail/exec_timer.hpp +++ b/src/detail/exec_timer.hpp @@ -57,6 +57,7 @@ class exec_timer : public solver_t { tbeg_loop = parent_t::clock::now(); trecord_all = parent_t::timer::zero(); // reset to 0, because we only want record all done in loop, not the one in ante_loop + tbeg_step = parent_t::clock::now(); // init tbeg_step, TODO: better to do it in the first call to get_step_time } this->mem->barrier(); } From 4e87379ca0e51f16d2e89364ad02df48e42586f6 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Wed, 14 Apr 2021 11:08:49 +0200 Subject: [PATCH 8/8] moist thermal case with horizontal velocity --- src/cases/MoistThermalGrabowskiClark99.hpp | 55 +++++++++++++++++++++- src/run_hlpr.cpp | 2 + 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/src/cases/MoistThermalGrabowskiClark99.hpp b/src/cases/MoistThermalGrabowskiClark99.hpp index 298e16f4df..fb05a5c989 100644 --- a/src/cases/MoistThermalGrabowskiClark99.hpp +++ b/src/cases/MoistThermalGrabowskiClark99.hpp @@ -31,7 +31,6 @@ namespace setup using libcloudphxx::common::const_cp::p_vs; using libcloudphxx::common::theta_std::p_1000; - // RH T and p to rv assuming RH = r_v / r_vs inline quantity RH_T_p_to_rv(const real_t &RH, const quantity &T, const quantity &p) { @@ -77,6 +76,17 @@ namespace setup const real_t z_abs = 125000; // [m] height above which absorber works, no absorber + // westerly wind + struct u + { + real_t operator()(const real_t &z) const + { + return 5 * cos(real_t((z * si::metres) / Z) * M_PI); + } + BZ_DECLARE_FUNCTOR(u); + }; + + ///WARNING: these functors, taken from Clark Farley 1984, are for dry air!! struct th_std_fctr @@ -339,6 +349,7 @@ namespace setup params.dz = params.dj; } + protected: // function expecting a libmpdata++ solver as argument void intcond(typename parent_t::concurr_any_t &solver, arr_1D_t &rhod, arr_1D_t &th_e, arr_1D_t &rv_e, arr_1D_t &rl_e, arr_1D_t &p_e, int rng_seed) @@ -387,6 +398,7 @@ namespace setup params.dz = params.dk; } + protected: // function expecting a libmpdata++ solver as argument void intcond(typename parent_t::concurr_any_t &solver, arr_1D_t &rhod, arr_1D_t &th_e, arr_1D_t &rv_e, arr_1D_t &rl_e, arr_1D_t &p_e, int rng_seed) @@ -423,5 +435,46 @@ namespace setup this->Y = Y; } }; + + template + class MoistThermalGrabowskiClark99_horvel; + + template + class MoistThermalGrabowskiClark99_horvel : public MoistThermalGrabowskiClark99 + { + using parent_t = MoistThermalGrabowskiClark99; + using ix = typename case_ct_params_t::ix; + + // function expecting a libmpdata++ solver as argument + void intcond(typename parent_t::concurr_any_t &solver, + arr_1D_t &rhod, arr_1D_t &th_e, arr_1D_t &rv_e, arr_1D_t &rl_e, arr_1D_t &p_e, int rng_seed) + { + parent_t::intcond(solver, rhod, th_e, rv_e, rl_e, p_e, rng_seed); + blitz::secondIndex k; + int nz = solver.advectee_global().extent(ix::w); + real_t dz = (Z / si::metres) / (nz-1); + solver.advectee(ix::u)= u()(k * dz); + solver.vab_relaxed_state(0) = solver.advectee(ix::u); + } + }; + + template + class MoistThermalGrabowskiClark99_horvel : public MoistThermalGrabowskiClark99 + { + using parent_t = MoistThermalGrabowskiClark99; + using ix = typename case_ct_params_t::ix; + + // function expecting a libmpdata++ solver as argument + void intcond(typename parent_t::concurr_any_t &solver, + arr_1D_t &rhod, arr_1D_t &th_e, arr_1D_t &rv_e, arr_1D_t &rl_e, arr_1D_t &p_e, int rng_seed) + { + parent_t::intcond(solver, rhod, th_e, rv_e, rl_e, p_e, rng_seed); + blitz::thirdIndex k; + int nz = solver.advectee_global().extent(ix::w); + real_t dz = (Z / si::metres) / (nz-1); + solver.advectee(ix::u)= u()(k * dz); + solver.vab_relaxed_state(0) = solver.advectee(ix::u); + } + }; }; }; diff --git a/src/run_hlpr.cpp b/src/run_hlpr.cpp index 7aed20cc1e..6d95ae93f7 100644 --- a/src/run_hlpr.cpp +++ b/src/run_hlpr.cpp @@ -89,6 +89,8 @@ void run(const int (&nps)[n_dims], const user_params_t &user_params) // setup choice if (user_params.model_case == "moist_thermal") case_ptr.reset(new setup::moist_thermal::MoistThermalGrabowskiClark99()); + else if (user_params.model_case == "moist_thermal_horvel") + case_ptr.reset(new setup::moist_thermal::MoistThermalGrabowskiClark99_horvel()); else if (user_params.model_case == "dry_thermal") case_ptr.reset(new setup::dry_thermal::DryThermal()); else if (user_params.model_case == "dycoms_rf01")