Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

moist thermal with horizontal velocity #125

Draft
wants to merge 9 commits into
base: master
Choose a base branch
from
55 changes: 54 additions & 1 deletion src/cases/MoistThermalGrabowskiClark99.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ namespace setup
using libcloudphxx::common::const_cp::p_vs;
using libcloudphxx::common::theta_std::p_1000;


// RH T and p to rv assuming RH = r_v / r_vs
inline quantity<si::dimensionless, real_t> RH_T_p_to_rv(const real_t &RH, const quantity<si::temperature, real_t> &T, const quantity<si::pressure, real_t> &p)
{
Expand Down Expand Up @@ -77,6 +76,17 @@ namespace setup

const real_t z_abs = 125000; // [m] height above which absorber works, no absorber

// westerly wind profile: u(z) = 5 * cos(pi * z / Z),
// i.e. +5 at z = 0, zero at mid-depth and -5 at z = Z
struct u
{
  real_t operator()(const real_t &z) const
  {
    const real_t frac = real_t((z * si::metres) / Z); // dimensionless height z/Z
    return 5 * cos(frac * M_PI);
  }
  BZ_DECLARE_FUNCTOR(u); // makes the functor usable in Blitz++ array expressions
};


///WARNING: these functors, taken from Clark Farley 1984, are for dry air!!

struct th_std_fctr
Expand Down Expand Up @@ -339,6 +349,7 @@ namespace setup
params.dz = params.dj;
}

protected:
// function expecting a libmpdata++ solver as argument
void intcond(typename parent_t::concurr_any_t &solver,
arr_1D_t &rhod, arr_1D_t &th_e, arr_1D_t &rv_e, arr_1D_t &rl_e, arr_1D_t &p_e, int rng_seed)
Expand Down Expand Up @@ -387,6 +398,7 @@ namespace setup
params.dz = params.dk;
}

protected:
// function expecting a libmpdata++ solver as argument
void intcond(typename parent_t::concurr_any_t &solver,
arr_1D_t &rhod, arr_1D_t &th_e, arr_1D_t &rv_e, arr_1D_t &rl_e, arr_1D_t &p_e, int rng_seed)
Expand Down Expand Up @@ -423,5 +435,46 @@ namespace setup
this->Y = Y;
}
};

template<class case_ct_params_t, int n_dims>
class MoistThermalGrabowskiClark99_horvel;

// 2D moist thermal case with an added horizontal (westerly) wind profile
template<class case_ct_params_t>
class MoistThermalGrabowskiClark99_horvel<case_ct_params_t, 2> : public MoistThermalGrabowskiClark99<case_ct_params_t, 2>
{
  using parent_t = MoistThermalGrabowskiClark99<case_ct_params_t, 2>;
  using ix = typename case_ct_params_t::ix;

  protected: // consistent with the base-case classes, which declare intcond protected
  // function expecting a libmpdata++ solver as argument
  // NOTE(review): assumes intcond is virtual in the case base class — confirm
  void intcond(typename parent_t::concurr_any_t &solver,
               arr_1D_t &rhod, arr_1D_t &th_e, arr_1D_t &rv_e, arr_1D_t &rl_e, arr_1D_t &p_e, int rng_seed)
  {
    // base-case initial condition first ...
    parent_t::intcond(solver, rhod, th_e, rv_e, rl_e, p_e, rng_seed);
    // ... then overwrite horizontal velocity with the u(z) profile
    blitz::secondIndex k; // vertical index in 2D
    int nz = solver.advectee_global().extent(ix::w); // number of vertical levels
    real_t dz = (Z / si::metres) / (nz - 1);
    solver.advectee(ix::u) = u()(k * dz);
    // absorber relaxes u back towards this initial profile
    solver.vab_relaxed_state(0) = solver.advectee(ix::u);
  }
};

// 3D moist thermal case with an added horizontal (westerly) wind profile
template<class case_ct_params_t>
class MoistThermalGrabowskiClark99_horvel<case_ct_params_t, 3> : public MoistThermalGrabowskiClark99<case_ct_params_t, 3>
{
  using parent_t = MoistThermalGrabowskiClark99<case_ct_params_t, 3>;
  using ix = typename case_ct_params_t::ix;

  protected: // consistent with the base-case classes, which declare intcond protected
  // function expecting a libmpdata++ solver as argument
  // NOTE(review): assumes intcond is virtual in the case base class — confirm
  void intcond(typename parent_t::concurr_any_t &solver,
               arr_1D_t &rhod, arr_1D_t &th_e, arr_1D_t &rv_e, arr_1D_t &rl_e, arr_1D_t &p_e, int rng_seed)
  {
    // base-case initial condition first ...
    parent_t::intcond(solver, rhod, th_e, rv_e, rl_e, p_e, rng_seed);
    // ... then overwrite horizontal velocity with the u(z) profile
    blitz::thirdIndex k; // vertical index in 3D
    int nz = solver.advectee_global().extent(ix::w); // number of vertical levels
    real_t dz = (Z / si::metres) / (nz - 1);
    solver.advectee(ix::u) = u()(k * dz);
    // absorber relaxes u back towards this initial profile
    solver.vab_relaxed_state(0) = solver.advectee(ix::u);
  }
};
};
};
23 changes: 21 additions & 2 deletions src/detail/exec_timer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,11 @@ class exec_timer : public solver_t
parent_t::hook_ante_loop(nt);
this->mem->barrier();
if (this->rank == 0)
{
tbeg_loop = parent_t::clock::now();
trecord_all = parent_t::timer::zero(); // reset to 0, because we only want record all done in loop, not the one in ante_loop
tbeg_step = parent_t::clock::now(); // init tbeg_step, TODO: better to do it in the first call to get_step_time
}
this->mem->barrier();
}

Expand Down Expand Up @@ -86,11 +90,17 @@ class exec_timer : public solver_t
tend_loop = parent_t::clock::now();
tloop = std::chrono::duration_cast<std::chrono::milliseconds>( tend_loop - tbeg_loop );

// calculate CPU/GPU times and concurrency, valid only for async runs and not taking into account diagnostics in record_all
typename parent_t::timer tsync_in = parent_t::tsync,
tgpu = parent_t::tasync_wait_in_record_all + parent_t::tsync_wait + parent_t::tasync_wait + tsync_in, // time of pure GPU calculations (= wait time of CPU)
tcpugpu = tsync_in + parent_t::tasync_gpu + parent_t::tsync_gpu - tgpu, // time of concurrent CPU and GPU calculations (= total time of GPU calculations - tgpu)
tcpu = tloop - tgpu - tcpugpu;

std::cout << "wall time in milliseconds: " << std::endl
<< "loop: " << tloop.count() << std::endl
<< " hook_ante_step: " << thas.count() << " ("<< setup::real_t(thas.count())/tloop.count()*100 <<"%)" << std::endl
<< " async_wait: " << parent_t::tasync_wait.count() << " ("<< setup::real_t(parent_t::tasync_wait.count())/tloop.count()*100 <<"%)" << std::endl
<< " hook_mixed_rhs_ante_step: " << thmas.count() << " ("<< setup::real_t(thmas.count())/tloop.count()*100 <<"%)" << std::endl
<< " async_wait: " << parent_t::tasync_wait.count() << " ("<< setup::real_t(parent_t::tasync_wait.count())/tloop.count()*100 <<"%)" << std::endl
<< " sync: " << parent_t::tsync.count() << " ("<< setup::real_t(parent_t::tsync.count())/tloop.count()*100 <<"%)" << std::endl
<< " step: " << thas_hads.count() << " ("<< setup::real_t(thas_hads.count())/tloop.count()*100 <<"%)" << std::endl
<< " hook_ante_delayed_step: " << thads.count() << " ("<< setup::real_t(thads.count())/tloop.count()*100 <<"%)" << std::endl
Expand All @@ -99,9 +109,18 @@ class exec_timer : public solver_t
<< " delayed step: " << thads_hps.count() << " ("<< setup::real_t(thads_hps.count())/tloop.count()*100 <<"%)" << std::endl
<< " hook_post_step: " << thps.count() << " ("<< setup::real_t(thps.count())/tloop.count()*100 <<"%)" << std::endl
<< " hook_mixed_rhs_post_step: " << thmps.count() << " ("<< setup::real_t(thmps.count())/tloop.count()*100 <<"%)" << std::endl
<< " record_all: " << trecord_all.count() << " ("<< setup::real_t(trecord_all.count())/tloop.count()*100 <<"%)" << std::endl
<< " record_all (in loop): " << trecord_all.count() << " ("<< setup::real_t(trecord_all.count())/tloop.count()*100 <<"%)" << std::endl
<< " async_wait in record_all: " << parent_t::tasync_wait_in_record_all.count() << " ("<< setup::real_t(parent_t::tasync_wait_in_record_all.count())/tloop.count()*100 <<"%)" << std::endl
<< " hook_post_step->hook_ante_step: " << thps_has.count() << " ("<< setup::real_t(thps_has.count())/tloop.count()*100 <<"%)" << std::endl;

std::cout << std::endl
<< "CPU/GPU concurrency stats, only make sense for async lgrngn runs" << std::endl
<< "and does not take into account GPU time in record_all, so most accurate without diag:" << std::endl
<< " pure CPU calculations: " << tcpu.count() << " ("<< setup::real_t(tcpu.count())/tloop.count()*100 <<"%)" << std::endl
<< " pure GPU calculations: " << tgpu.count() << " ("<< setup::real_t(tgpu.count())/tloop.count()*100 <<"%)" << std::endl
<< " concurrent CPU&GPU: " << tcpugpu.count() << " ("<< setup::real_t(tcpugpu.count())/tloop.count()*100 <<"%)" << std::endl
<< " tsync_gpu: " << parent_t::tsync_gpu.count() << " ("<< setup::real_t(parent_t::tsync_gpu.count())/tloop.count()*100 <<"%)" << std::endl
<< " tasync_gpu: " << parent_t::tasync_gpu.count() << " ("<< setup::real_t(parent_t::tasync_gpu.count())/tloop.count()*100 <<"%)" << std::endl;
}
}
}
Expand Down
86 changes: 86 additions & 0 deletions src/detail/func_time.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// function that calculates execution time of any other member function called via ptr
#pragma once

// async_forwarder code taken from https://kholdstare.github.io/technical/2012/12/18/perfect-forwarding-to-async-2.html (C) Alexander Kondratskiy
// it is used to pass any type of reference (lvalue or rvalue) to std::async
// Wrapper that owns a value moved into it, used to pass rvalues to std::async
// without incurring copies (std::async decay-copies its arguments; copying
// this wrapper is deleted, moving it moves only the stored value once).
template <typename T>
class async_forwarder
{
  // Store value directly
  T val_;

public:
  /**
   * Move an rvalue of T into the wrapper,
   * incurring no copies.
   */
  async_forwarder(T&& t)
    : val_(std::move(t)) { }

  // ensure no copies are made
  async_forwarder(async_forwarder const& other) = delete;

  // move constructor
  async_forwarder(async_forwarder&& other)
    : val_(std::move(other.val_)) { }

  // Move the value out.
  // Note: can only occur once!
  operator T&& () { return std::move(val_); }

  // Const access cannot hand out a mutable rvalue reference: std::move on the
  // const member yields const T&&, so the conversion must return const T&&.
  // (The previous `operator T&& () const` failed to compile when instantiated.)
  operator const T&& () const { return std::move(val_); }
};

// This particular specialization
// is essentially std::ref
// Specialization for lvalue references — essentially std::ref: std::async
// decay-copies only this lightweight wrapper, not the referee itself.
template <typename T>
class async_forwarder<T&>
{
  T& ref_;

public:
  // Wrap an lvalue reference, to fool std::async into
  // not copying the referred-to object
  async_forwarder(T& t) : ref_(t) { }

  // copying is forbidden — moving the wrapper suffices
  async_forwarder(async_forwarder const& other) = delete;

  // moving just re-binds to the same referee
  async_forwarder(async_forwarder&& other)
    : ref_(other.ref_) { }

  // implicit conversions hand the original reference back out
  operator T& () { return ref_; }
  operator T const& () const { return ref_; }
};


// Call member function `func` on the object pointed to by `p`, forwarding
// `args`. When UWLCM_TIMING is defined, returns the wall-clock execution time
// (measured with `clock`, cast to the `timer` duration type); otherwise
// returns a zero duration. The template signature and the call were
// previously duplicated across the two preprocessor branches; only the
// timing code differs, so the #if now lives inside one definition.
template<class clock, class timer, class F, class ptr, typename... Args>
timer func_time(F func, ptr p, Args&&... args){
#if defined(UWLCM_TIMING)
  const auto t1 = clock::now();
  (p->*func)(std::forward<Args>(args)...);
  return std::chrono::duration_cast<timer>(clock::now() - t1);
#else
  (p->*func)(std::forward<Args>(args)...);
  return timer();
#endif
}

// Launch member function `func` on the object pointed to by `p` in a separate
// thread via std::async, and return a future holding its execution time as
// measured by func_time<clock, timer>. lvalue args reach the thread by
// reference (via the async_forwarder<T&> specialization), rvalue args are
// moved in — so the caller must keep lvalue args alive until the future is
// retrieved.
template<class clock, class timer, class F, class ptr, typename... Args>
std::future<timer> async_timing_launcher(F func, ptr p, Args&&... args) // func and p are pointers, so their copies are lightweight
{
  return std::async(
    std::launch::async,
    // template arguments spelled out explicitly: std::async could not deduce
    // them through the forwarder wrappers below
    func_time<clock, timer, F, ptr, Args...>,
    func,
    p,
    async_forwarder<Args>(std::forward<Args>(args))... // ATTENTION! args are passed by reference to async
  );
}
2 changes: 2 additions & 0 deletions src/run_hlpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,8 @@ void run(const int (&nps)[n_dims], const user_params_t &user_params)
// setup choice
if (user_params.model_case == "moist_thermal")
case_ptr.reset(new setup::moist_thermal::MoistThermalGrabowskiClark99<case_ct_params_t, n_dims>());
else if (user_params.model_case == "moist_thermal_horvel")
case_ptr.reset(new setup::moist_thermal::MoistThermalGrabowskiClark99_horvel<case_ct_params_t, n_dims>());
else if (user_params.model_case == "dry_thermal")
case_ptr.reset(new setup::dry_thermal::DryThermal<case_ct_params_t, n_dims>());
else if (user_params.model_case == "dycoms_rf01")
Expand Down
24 changes: 19 additions & 5 deletions src/solvers/lgrngn/hook_ante_delayed_step_lgrngn.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#if defined(STD_FUTURE_WORKS)
# include <future>
#endif
#include "../../detail/func_time.hpp"

template <class ct_params_t>
void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
Expand All @@ -19,7 +20,11 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
#if defined(UWLCM_TIMING)
tbeg = parent_t::clock::now();
#endif
#if defined(UWLCM_TIMING)
parent_t::tsync_gpu += ftr.get();
#else
ftr.get();
#endif
#if defined(UWLCM_TIMING)
tend = parent_t::clock::now();
parent_t::tsync_wait += std::chrono::duration_cast<std::chrono::milliseconds>( tend - tbeg );
Expand Down Expand Up @@ -48,6 +53,8 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()

// store liquid water content (post-cond, pre-adve and pre-subsidence)
diag_rl();
if(ct_params_t::sgs_scheme == libmpdataxx::solvers::smg)
diag_rc();

if (this->rank == 0)
{
Expand All @@ -64,15 +71,13 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
{
assert(!ftr.valid());
if(params.backend == CUDA)
ftr = std::async(
std::launch::async,
ftr = async_timing_launcher<typename parent_t::clock, typename parent_t::timer>(
&particles_t<real_t, CUDA>::step_async,
dynamic_cast<particles_t<real_t, CUDA>*>(prtcls.get()),
params.cloudph_opts
);
else if(params.backend == multi_CUDA)
ftr = std::async(
std::launch::async,
ftr = async_timing_launcher<typename parent_t::clock, typename parent_t::timer>(
&particles_t<real_t, multi_CUDA>::step_async,
dynamic_cast<particles_t<real_t, multi_CUDA>*>(prtcls.get()),
params.cloudph_opts
Expand Down Expand Up @@ -106,9 +111,18 @@ void slvr_lgrngn<ct_params_t>::hook_ante_delayed_step()
this->vert_grad_cnt(tmp1, F, params.dz);
F(ijk).reindex(this->zero) *= - (*params.w_LS)(this->vert_idx);
r_l(ijk) += F(ijk) * this->dt;

tmp1(ijk) = r_c(ijk);
// fill halos for gradient calculation
// TODO: no need to xchng in horizontal, which potentially causes MPI communication
this->xchng_sclr(tmp1, this->ijk, this->halo);
this->vert_grad_cnt(tmp1, F, params.dz);
F(ijk).reindex(this->zero) *= - (*params.w_LS)(this->vert_idx);
r_c(ijk) += F(ijk) * this->dt;
}

// advect r_l (1st-order)
// advect r_l and r_c (1st-order)
this->self_advec_donorcell(this->r_l);
this->self_advec_donorcell(this->r_c);
negcheck(this->mem->advectee(ix::rv)(this->ijk), "rv at the end of ante delayed step");
}
22 changes: 0 additions & 22 deletions src/solvers/lgrngn/hook_ante_step_lgrngn.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,6 @@
// Pre-advection hook of the Lagrangian-microphysics solver: rank 0 waits for
// the asynchronous particle step launched in the previous timestep to finish
// (accumulating the wait time when timing is enabled), then the parent hook
// runs, which computes the RHS and in turn launches sync_in and step_cond.
template <class ct_params_t>
void slvr_lgrngn<ct_params_t>::hook_ante_step()
{
  if (this->rank == 0)
  {
    // assuring previous async step finished ...
#if defined(STD_FUTURE_WORKS)
    if (
      params.async &&
      this->timestep != 0 && // ... but not in first timestep ...
      ((this->timestep ) % this->outfreq != 0) // ... and not after diag call, note: timestep is updated after ante_step
    ) {
      assert(ftr.valid());
#if defined(UWLCM_TIMING)
      tbeg = parent_t::clock::now();
#endif
      ftr.get(); // block until the async particle step completes
#if defined(UWLCM_TIMING)
      tend = parent_t::clock::now();
      parent_t::tasync_wait += std::chrono::duration_cast<std::chrono::milliseconds>( tend - tbeg );
#endif
    } else assert(!ftr.valid());
#endif
  }
  this->mem->barrier(); // other ranks wait here for rank 0
  parent_t::hook_ante_step(); // includes RHS, which in turn launches sync_in and step_cond
  // sanity check: no negative water vapour after the step
  // (message fixed: was the garbled "rv after at the end of ...")
  negcheck(this->mem->advectee(ix::rv)(this->ijk), "rv at the end of hook_ante_step");
}
30 changes: 26 additions & 4 deletions src/solvers/lgrngn/hook_mixed_rhs_ante_step_lgrngn.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once
#include "../slvr_lgrngn.hpp"
#include "../../detail/func_time.hpp"
#if defined(STD_FUTURE_WORKS)
# include <future>
#endif
Expand Down Expand Up @@ -33,6 +34,29 @@ void slvr_lgrngn<ct_params_t>::hook_mixed_rhs_ante_step()
Cz.reindex(this->zero) /= (*params.rhod)(this->vert_idx); // TODO: should be interpolated, since theres a shift between positions of rhod and Cz
}

// assuring previous async step finished ...
#if defined(STD_FUTURE_WORKS)
if (
params.async &&
this->timestep != 0 && // ... but not in first timestep ...
((this->timestep ) % this->outfreq != 0) // ... and not after diag call, note: timestep is updated after ante_step
) {
assert(ftr.valid());
#if defined(UWLCM_TIMING)
tbeg = parent_t::clock::now();
#endif
#if defined(UWLCM_TIMING)
parent_t::tasync_gpu += ftr.get();
#else
ftr.get();
#endif
#if defined(UWLCM_TIMING)
tend = parent_t::clock::now();
parent_t::tasync_wait += std::chrono::duration_cast<std::chrono::milliseconds>( tend - tbeg );
#endif
} else assert(!ftr.valid());
#endif

// start synchronous stuff timer
#if defined(UWLCM_TIMING)
tbeg = parent_t::clock::now();
Expand Down Expand Up @@ -62,8 +86,7 @@ void slvr_lgrngn<ct_params_t>::hook_mixed_rhs_ante_step()
{
assert(!ftr.valid());
if(params.backend == CUDA)
ftr = std::async(
std::launch::async,
ftr = async_timing_launcher<typename parent_t::clock, typename parent_t::timer>(
&particles_t<real_t, CUDA>::step_cond,
dynamic_cast<particles_t<real_t, CUDA>*>(prtcls.get()),
params.cloudph_opts,
Expand All @@ -72,8 +95,7 @@ void slvr_lgrngn<ct_params_t>::hook_mixed_rhs_ante_step()
std::map<enum libcloudphxx::common::chem::chem_species_t, libcloudphxx::lgrngn::arrinfo_t<real_t> >()
);
else if(params.backend == multi_CUDA)
ftr = std::async(
std::launch::async,
ftr = async_timing_launcher<typename parent_t::clock, typename parent_t::timer>(
&particles_t<real_t, multi_CUDA>::step_cond,
dynamic_cast<particles_t<real_t, multi_CUDA>*>(prtcls.get()),
params.cloudph_opts,
Expand Down
2 changes: 1 addition & 1 deletion src/solvers/slvr_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ class slvr_common : public slvr_dim<ct_params_t>

using clock = std::chrono::system_clock;
using timer = std::chrono::milliseconds;
timer tsync, tsync_wait, tasync, tasync_wait, tasync_wait_in_record_all; // timings used in lgrngn solver TODO: move them to slvr_lgrngn
timer tsync, tsync_gpu, tsync_wait, tasync, tasync_gpu, tasync_wait, tasync_wait_in_record_all; // timings used in lgrngn solver TODO: move them to slvr_lgrngn

protected:
#endif
Expand Down
Loading