Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

combined inner outer reduction, add a simple test case #2400

Open
wants to merge 14 commits into
base: devel
Choose a base branch
from
1 change: 1 addition & 0 deletions third_party/nvfuser/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@ if(BUILD_TEST)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_gather_ops.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_multidevice.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_multicluster_fusion.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_combined_inner_outer_reduction.cpp)

set(JIT_TEST_CU_SRCS)
list(APPEND JIT_TEST_CU_SRCS ${NVFUSER_ROOT}/test/test_gpu_rng.cu)
Expand Down
25 changes: 5 additions & 20 deletions third_party/nvfuser/csrc/executor_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -967,26 +967,11 @@ c10::optional<int> getMaxRegCount(
// If the block size is known, set the maximum that at least allows
// one block to be resident on an SM
if (opt_block_size.has_value() && opt_block_size.value() > 0) {
int num_partition = 0;
int reg_allocation_granularity = 0;
const auto prop = at::cuda::getCurrentDeviceProperties();
cudaOccDeviceProp occ_prop(*prop);
cudaOccSubPartitionsPerMultiprocessor(&num_partition, &occ_prop);
cudaOccRegAllocationGranularity(&reg_allocation_granularity, &occ_prop);
int warp_size = prop->warpSize;
int num_warps = ceilDiv(opt_block_size.value(), warp_size);

// warps could be distributed unevenly across partition
int max_warps_per_sm_partition = ceilDiv(num_warps, num_partition);
// registers are evenly distributed across partitions; the partition with
// the most warps determines the maximum registers available per warp
int max_reg_per_warp =
prop->regsPerBlock / num_partition / max_warps_per_sm_partition;
// clamp down to register allocation granularity at warp level
int effective_max_reg_per_warp = max_reg_per_warp /
reg_allocation_granularity * reg_allocation_granularity;
max_register =
std::min(max_register_limit, effective_max_reg_per_warp / warp_size);
constexpr int block_per_sm = 1;
max_register = std::min(
max_register_limit,
(int)getRegPerThreadGivenThreadsPerSM(
opt_block_size.value() * block_per_sm));
}

// If a heuristic value is given, i.e., max_register_heuristic is
Expand Down
4 changes: 4 additions & 0 deletions third_party/nvfuser/csrc/ir_internal_nodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -1486,6 +1486,10 @@ class TORCH_CUDA_CU_API IterDomain : public Val {
return getIterType() == IterType::Reduction;
}

//! Returns true if this IterDomain's IterType is Iteration, i.e. a
//! plain iteration domain (contrast with isReduction() above, which
//! checks for IterType::Reduction).
bool isIteration() const {
  const auto iter_type = getIterType();
  return iter_type == IterType::Iteration;
}

bool isRFactorProduct() const {
return is_rfactor_domain_;
}
Expand Down
3 changes: 2 additions & 1 deletion third_party/nvfuser/csrc/kernel_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,8 @@ std::vector<at::Tensor> FusionKernelRuntime::runKernelWithInput(
}

auto& executor = executors_[group_id];
if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose) ||
measure_kernel_time_) {
executor.setMeasureKernelTimeFlag(true);
}

Expand Down
5 changes: 5 additions & 0 deletions third_party/nvfuser/csrc/kernel_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ class TORCH_CUDA_CU_API FusionKernelRuntime {
profiling_ = to_profile;
}

//! Enable (default) or disable kernel execution-time measurement.
//! The stored flag is read when running kernels to decide whether the
//! executor should time its launches even without PerfDebugVerbose.
void setMeasureKernelTime(bool enable = true) {
  measure_kernel_time_ = enable;
}

//! Internal knob for profiling shape inference
void disableLaunchParamCache() {
for (auto& executor : executors_) {
Expand Down Expand Up @@ -190,6 +194,7 @@ class TORCH_CUDA_CU_API FusionKernelRuntime {

// States for profiling support
bool profiling_ = false;
bool measure_kernel_time_ = false;

std::mutex mutex_;
// TODO: remove `compiling_` mutex and rely on `mutex_` only.
Expand Down
20 changes: 20 additions & 0 deletions third_party/nvfuser/csrc/maxinfo_propagator.h
Original file line number Diff line number Diff line change
Expand Up @@ -273,4 +273,24 @@ class TORCH_CUDA_CU_API SetSelector : public MaxInfoSpanningTree::Selector {
}
};

// Simple selector to allow different parallel patterns in the fusion.
// The propagation is blocked at boundaryNodesSet.
// For P2C forward propagate, disable propagation to tensorViews in
// boundaryNodesSet. For C2P backward propagate, disable propagation from
// tensorViews in boundaryNodesSet
struct InternalBoundarySelector : public MaxInfoSpanningTree::Selector {
std::unordered_set<TensorView*> tvs_;
virtual bool allowC2P(TensorView* from, TensorView* to) override {
return tvs_.count(from) == 0;
};
virtual bool allowP2C(TensorView* from, TensorView* to) override {
return tvs_.count(to) == 0;
};
virtual bool allowSibling(TensorView* from, TensorView* to) override {
return true;
}
InternalBoundarySelector(const std::unordered_set<TensorView*>& tvs)
: tvs_(tvs) {}
};

} // namespace nvfuser
Loading