add epilogue to store MMA results in shared memory before write to #387

Merged Jul 17, 2023 (36 commits)

Changes from 16 commits

Commits
379eab5
add epilogue to store MMA results in shared memory before write to
liqiangxl May 22, 2023
b500d36
revise test
liqiangxl Jun 8, 2023
acf1167
format
liqiangxl Jun 8, 2023
be885b0
swizzleSharedMemory
liqiangxl Jun 20, 2023
7138cbe
format
liqiangxl Jun 20, 2023
fad4ad8
fix failed test cases
liqiangxl Jun 21, 2023
84e3e98
propagate to epilogue tensors
liqiangxl Jun 28, 2023
a94e5df
check num_shared_mem_tensors
liqiangxl Jun 28, 2023
9f4bcc4
format
liqiangxl Jun 28, 2023
1760c15
disable_smem_epilogue
liqiangxl Jun 28, 2023
f0ff6f9
extend MatmulSASSTest
liqiangxl Jun 28, 2023
da5dc3a
schedule output tensor
liqiangxl Jun 29, 2023
537b855
wip
liqiangxl Jun 29, 2023
ab86a1f
use propagate
liqiangxl Jul 1, 2023
f2a75cd
fix failed case
liqiangxl Jul 3, 2023
7a4d5b5
fix ci fails by increasing tolerance:x
liqiangxl Jul 4, 2023
86b8911
merge main
liqiangxl Jul 7, 2023
5586b3a
fix failed cases
liqiangxl Jul 7, 2023
6a8f139
trivial fix
liqiangxl Jul 7, 2023
d6212cb
format
liqiangxl Jul 9, 2023
95ea553
revise hasEnoughSharedMemoryForEpilogue
liqiangxl Jul 9, 2023
925e04d
merge main
liqiangxl Jul 9, 2023
1f30a36
wip
liqiangxl Jul 10, 2023
80d7588
cacheAfter mma_result
liqiangxl Jul 10, 2023
d3019f0
add epilogue cast and relu tests
liqiangxl Jul 10, 2023
212258c
trivial fix
liqiangxl Jul 10, 2023
a2045cd
mma data types
liqiangxl Jul 12, 2023
32f43d8
merge main
liqiangxl Jul 12, 2023
67ecdb0
revise smem swizzle
liqiangxl Jul 13, 2023
864a918
test with revised swizzle
liqiangxl Jul 13, 2023
79c452c
merge main
liqiangxl Jul 14, 2023
26d970d
save file
liqiangxl Jul 14, 2023
189cef4
revise based on review comments
liqiangxl Jul 17, 2023
889ce23
rename
liqiangxl Jul 17, 2023
94aae9b
change default to false
liqiangxl Jul 17, 2023
27870d0
Merge branch 'main' into llu/matmul_epilogue
liqiangxl Jul 17, 2023

341 changes: 252 additions & 89 deletions csrc/scheduler/matmul.cpp

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions csrc/scheduler/matmul_heuristic.h
@@ -90,6 +90,10 @@ class MatmulParams : public HeuristicParams {
//! C3 C4 D3 D4
int grid_swizzle_factor = 1;

//! Unswizzle MMA results in shared memory to get
//! coalesced write to global memory
bool has_smem_epilogue = true;

std::string toString() const override {
std::stringstream ss;
ss << "\n===== Matmul Parameters ========\n"
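For intuition about what has_smem_epilogue enables, here is a standalone CUDA sketch of the underlying staging pattern. This is purely illustrative and not nvfuser-generated code: the 16x16 tile, the kernel name, and the thread-to-element mapping are made up. The point is only that results sitting in registers in an MMA-friendly (non-contiguous) layout are first written to shared memory and then read back row-major, so the block's store to global memory is coalesced.

#include <cuda_runtime.h>

// One 16x16 float output tile per block, 256 threads, one element per thread.
__global__ void epilogue_via_smem(float* __restrict__ out, int ld) {
  __shared__ float tile[16][17];  // +1 column of padding to avoid bank conflicts

  // Stand-in for the accumulator fragment an MMA leaves in registers; the
  // thread-to-element mapping is typically not row-major/coalesced.
  float acc = static_cast<float>(threadIdx.x);

  // Stage the fragment to shared memory in its "MMA layout".
  int frag_row = threadIdx.x % 16;
  int frag_col = threadIdx.x / 16;
  tile[frag_row][frag_col] = acc;
  __syncthreads();

  // Read back row-major so consecutive threads write consecutive global
  // addresses: a coalesced store.
  int row = threadIdx.x / 16;
  int col = threadIdx.x % 16;
  out[(blockIdx.y * 16 + row) * ld + blockIdx.x * 16 + col] = tile[row][col];
}

int main() {
  const int n = 64;
  float* d_out = nullptr;
  cudaMalloc(&d_out, n * n * sizeof(float));
  dim3 grid(n / 16, n / 16);
  epilogue_via_smem<<<grid, 256>>>(d_out, n);
  cudaDeviceSynchronize();
  cudaFree(d_out);
  return 0;
}

The padding column above is the simplest way to dodge bank conflicts; the scheduler in this PR instead uses a swizzled shared-memory layout (see the swizzleSharedMemory commits) for the same purpose.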
11 changes: 11 additions & 0 deletions csrc/scheduler/matmul_utils.cpp
@@ -376,6 +376,17 @@ std::shared_ptr<MatmulParams> getMatmulHeuristics(
// Disable magic zero for matmul kernels
params->cparams.enable_magic_zero = false;

// Disable shared memory epilogue before shared memory reuse is implemented.
// Otherwise, there will be performance regression due to reduced occupancy
// caused by extra shared memory usage.
constexpr bool disable_smem_epilogue = false;
if (!disable_smem_epilogue) {
// Check if we have enough shared memory for epilogue
params->has_smem_epilogue = mma_utils::hasEnoughSharedMemoryForEpilogue(
params->tile_sizes,
params->double_buffer_options.smem_double_buffer_stage);
}

if (isDebugDumpEnabled(DebugDumpOption::MatmulChecks)) {
printMsg(params->toString());
}
48 changes: 38 additions & 10 deletions csrc/scheduler/mma_utils.cpp
@@ -6,6 +6,7 @@
*/
// clang-format on

#include <ATen/cuda/CUDAContext.h>
#include <device_lower/utils.h>
#include <expr_evaluator.h>
#include <ir/printer.h>
@@ -14,11 +15,37 @@
#include <scheduler/utils.h>
#include <variant>
#include "mma_type.h"

namespace nvfuser {

namespace mma_utils {

bool hasEnoughSharedMemoryForEpilogue(
const MatMulTileOptions& gemm_tile,
const int smem_double_buffer_stage) {
auto properties = at::cuda::getDeviceProperties(
c10::Device(c10::DeviceType::CUDA, 0).index());
const size_t device_smem_limit = properties->sharedMemPerBlockOptin;

// see scheduleContiguousVectorLoad
const int vector_word = 8;
auto warp_dims = gemm_tile.cta_tile / gemm_tile.warp_tile;
const int round_to_factor =
warp_dims.m * warp_dims.n * warp_dims.k * 32 * vector_word;
const int mk = gemm_tile.cta_tile.m * gemm_tile.cta_tile.k;
const int nk = gemm_tile.cta_tile.n * gemm_tile.cta_tile.k;
const size_t smem_a = (size_t)(ceilDiv(mk, round_to_factor) *
round_to_factor * smem_double_buffer_stage) *
dataTypeSize(DataType::Half);
const size_t smem_b = (size_t)(ceilDiv(nk, round_to_factor) *
round_to_factor * smem_double_buffer_stage) *
dataTypeSize(DataType::Half);
const size_t smem_c = (size_t)(gemm_tile.cta_tile.m * gemm_tile.cta_tile.n) *
dataTypeSize(DataType::Float);
const size_t smem_size = smem_a + smem_b + smem_c;

return smem_size <= device_smem_limit;

Collaborator:

I think you are checking whether smem_size <= device_smem_limit and enabling the smem unswizzle when this condition is satisfied. I don't think this is the correct behavior. Each kernel has its designed occupancy; if the designed occupancy is N, then we should have N*smem_size <= device_smem_limit. cc: @drzejan2 Any opinion on this?

Collaborator:

You are right, @zasdfgbnm.

The first thought that comes to my mind is to calculate the current occupancy where only smem_a and smem_b are in use, and then check that the new occupancy, with smem_a, smem_b, and smem_c, does not change. Something along these lines:

const auto curr_smem_occupancy = device_smem_limit / (smem_a + smem_b);
const auto new_smem_occupancy = device_smem_limit / (smem_a + smem_b + smem_c);

return new_smem_occupancy >= curr_smem_occupancy;

Collaborator Author:

Good point. But without memory reuse, new_smem_occupancy < curr_smem_occupancy is almost always true. For example, I checked NVFuserTest.FusionAmpereMatmulTileCheck4warp_CUDA and the results are:

curr_smem_occupancy= 20 new_smem_occupancy= 13
curr_smem_occupancy= 10 new_smem_occupancy= 8
curr_smem_occupancy= 10 new_smem_occupancy= 8
curr_smem_occupancy= 10 new_smem_occupancy= 5
curr_smem_occupancy= 6 new_smem_occupancy= 4
curr_smem_occupancy= 5 new_smem_occupancy= 3
curr_smem_occupancy= 6 new_smem_occupancy= 2
curr_smem_occupancy= 4 new_smem_occupancy= 2
curr_smem_occupancy= 3 new_smem_occupancy= 1
curr_smem_occupancy= 5 new_smem_occupancy= 1
curr_smem_occupancy= 3 new_smem_occupancy= 1
curr_smem_occupancy= 2 new_smem_occupancy= 1
curr_smem_occupancy= 4 new_smem_occupancy= 1
curr_smem_occupancy= 2 new_smem_occupancy= 0

This function was designed to ensure we can launch the kernel with the shared memory epilogue, without considering the occupancy. By doing this we can increase the number of cases that can be tested with the smem epilogue. For performance purposes we definitely need N*smem_size <= device_smem_limit; in this revision N is derived from the register limitation. For the case NVFuserTest.FusionAmpereMatmulTileCheck4warp_CUDA, N is 1 or 2.

blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 1 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 1 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 1 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 2
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 1
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 2 blocks_per_sm_with_smem_epilogue= 0
blocks_per_sm_without_smem_epilogue= 1 blocks_per_sm_with_smem_epilogue= 0

Collaborator:

Thank you @liqiangxl for running the verification of smem-based and register-based occupancy. The updated implementation looks good.

}
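To make the criterion from the review thread above concrete, here is a hedged sketch, not part of this PR, of an occupancy-aware variant of the check: with a designed occupancy of N resident blocks, require N*smem_size <= device_smem_limit. The function and parameter names are hypothetical; in the author's follow-up, N comes from a register-based occupancy estimate.

#include <cstddef>

// Hedged sketch of the occupancy-aware criterion discussed above; all names
// are illustrative, not nvfuser API.
bool smemEpilogueFitsAtOccupancy(
    std::size_t smem_a,             // operand A staging across all stages, bytes
    std::size_t smem_b,             // operand B staging across all stages, bytes
    std::size_t smem_c,             // epilogue (MMA result) staging, bytes
    std::size_t device_smem_limit,  // e.g. sharedMemPerBlockOptin
    int target_blocks_per_sm) {     // N, the designed occupancy
  const std::size_t per_block = smem_a + smem_b + smem_c;
  return static_cast<std::size_t>(target_blocks_per_sm) * per_block <=
      device_smem_limit;
}

With target_blocks_per_sm = 1 this reduces to the launchability check implemented by hasEnoughSharedMemoryForEpilogue above.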

void scheduleWarpTileWithReduction(TensorView* tv, MatMulTileOptions tile) {
// Assumes
// [M, N, K]
@@ -379,11 +406,6 @@ bool canValidateIsInnerDim(
if (!split->factor()->isConstInt()) {
return false;
}
if (split->factor()->evaluateInt() < inner_dim_size) {
// This might be too restrictive. Would need more
// bookkeeping to relax.
return false;
}
leaf = split->in();
} else if (auto merge = dynamic_cast<Merge*>(expr)) {
// Might consider just rejecting merge.
@@ -396,9 +418,6 @@
if (!leaf->extent()->isConstInt()) {
return false;
}
if (leaf->extent()->evaluateInt() != inner_dim_size) {
return false;
}
leaf = merge->inner();
} else {
// No support for swizzled inner dim for now.
@@ -438,7 +457,9 @@ void checkDimSize(
":",
id->extent()->evaluateInt(),
"vs",
expect[axis_index]);
expect[axis_index],
"\n for tv: ",
tv->toString());
}
}

@@ -699,6 +720,13 @@ void validateMmaRootInnerMNK(
//! swizzles to the right axes.
//! This check will be relaxed as we build out the mma usage patterns.
void validateMmaRootInnerMN(TensorView* tv, MmaOptions options, int m, int n) {
auto is_mma_output =
tv->definition() != nullptr && tv->definition()->isA<MmaOp>();
// This function is also used to transform epilogue tensors. An epilogue
// tensor is not an mma output, so the following checks can be skipped.
if (!is_mma_output) {
return;
}
auto mma = options.mmaOp();
auto m_dims = getMmaRootDimensions(tv, mma, MmaDimension::M);
auto n_dims = getMmaRootDimensions(tv, mma, MmaDimension::N);
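As a worked example of the estimate made by hasEnoughSharedMemoryForEpilogue above, this standalone program redoes the arithmetic for one hypothetical configuration: cta_tile 128x128x32, warp_tile 64x64x32, two double-buffer stages. The tile sizes are chosen for illustration and are not taken from the PR.

#include <cstddef>
#include <cstdio>

constexpr int ceilDiv(int a, int b) {
  return (a + b - 1) / b;
}

int main() {
  constexpr int cta_m = 128, cta_n = 128, cta_k = 32;   // hypothetical CTA tile
  constexpr int warp_m = 64, warp_n = 64, warp_k = 32;  // hypothetical warp tile
  constexpr int stages = 2;                             // smem double-buffer stages
  constexpr int vector_word = 8;  // see scheduleContiguousVectorLoad

  // warp_dims = cta_tile / warp_tile = {2, 2, 1}
  constexpr int round_to_factor =
      (cta_m / warp_m) * (cta_n / warp_n) * (cta_k / warp_k) * 32 * vector_word;
  // = 2 * 2 * 1 * 32 * 8 = 1024 elements

  constexpr std::size_t smem_a =
      static_cast<std::size_t>(ceilDiv(cta_m * cta_k, round_to_factor)) *
      round_to_factor * stages * 2 /* sizeof(half) */;  // 16 KiB
  constexpr std::size_t smem_b =
      static_cast<std::size_t>(ceilDiv(cta_n * cta_k, round_to_factor)) *
      round_to_factor * stages * 2 /* sizeof(half) */;  // 16 KiB
  constexpr std::size_t smem_c =
      static_cast<std::size_t>(cta_m) * cta_n * 4 /* sizeof(float) */;  // 64 KiB

  // Total: 96 KiB, so this configuration enables the smem epilogue only on
  // devices whose sharedMemPerBlockOptin is at least 98304 bytes.
  std::printf("smem_a + smem_b + smem_c = %zu bytes\n", smem_a + smem_b + smem_c);
  return 0;
}

For this configuration the epilogue buffer (64 KiB of float accumulators) dominates the 32 KiB of operand staging, which is why the heuristic gates it on the device's opt-in shared-memory limit.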
5 changes: 5 additions & 0 deletions csrc/scheduler/mma_utils.h
@@ -17,6 +17,11 @@ namespace nvfuser {

namespace mma_utils {

//! Check if there is enough shared memory for the given tile options
TORCH_CUDA_CU_API bool hasEnoughSharedMemoryForEpilogue(
const MatMulTileOptions& gemm_tile,
const int smem_double_buffer_stage);

//! Utilities in this namespace facilitate scheduling matmul kernels with
//! hierarchical tiling specified in MatMulTileOptions.

16 changes: 10 additions & 6 deletions csrc/transform_replay.cpp
@@ -333,7 +333,7 @@ std::pair<TensorDomain*, size_t> TransformReplay::replayPasC(
consumer,
(int)consumer_pos,
root_map,
false,
opt.skip_target_swizzle,
!opt.replay_swizzle,
!opt.replay_resize);

@@ -609,7 +609,7 @@ std::pair<TensorDomain*, size_t> TransformReplay::replayCasP(
producer,
(int)producer_pos,
root_map,
false,
opt.skip_target_swizzle,
!opt.replay_swizzle,
!opt.replay_resize);

@@ -1085,7 +1085,8 @@ void TransformPropagator::propagateC2P(TensorView* from, TensorView* to) {
std::cout << " to: " << to << std::endl;
}
if (new_pos < 0) {
auto replay = TransformReplay::replayPasC(to, from, pos);
auto replay = TransformReplay::replayPasC(
to, from, pos, TransformReplayOptions().skipTargetSwizzle());
TORCH_INTERNAL_ASSERT(
validateDomain(to, replay.first),
"Tried to set the domain of ",
@@ -1116,7 +1117,8 @@ void TransformPropagator::propagateP2C(TensorView* from, TensorView* to) {
std::cout << " to: " << to << std::endl;
}
if (new_pos < 0) {
auto replay = TransformReplay::replayCasP(to, from, pos);
auto replay = TransformReplay::replayCasP(
to, from, pos, TransformReplayOptions().skipTargetSwizzle());
TORCH_INTERNAL_ASSERT(
validateDomain(to, replay.first),
"Tried to set the domain of ",
@@ -1187,7 +1189,8 @@ void MostInlinedTransformPropagator::propagateC2P(
std::cout << " to: " << to << std::endl;
}
if (new_pos < 0) {
auto replay = TransformReplay::replayPasC(to, from, pos);
auto replay = TransformReplay::replayPasC(
to, from, pos, TransformReplayOptions().skipTargetSwizzle());
TORCH_INTERNAL_ASSERT(
validateDomain(to, replay.first),
"Tried to set the domain of ",
@@ -1218,7 +1221,8 @@ void MostInlinedTransformPropagator::propagateP2C(
std::cout << " to: " << to << std::endl;
}
if (new_pos < 0) {
auto replay = TransformReplay::replayCasP(to, from, pos);
auto replay = TransformReplay::replayCasP(
to, from, pos, TransformReplayOptions().skipTargetSwizzle());
TORCH_INTERNAL_ASSERT(
validateDomain(to, replay.first),
"Tried to set the domain of ",
31 changes: 31 additions & 0 deletions csrc/transform_replay.h
@@ -130,10 +130,41 @@ class TensorView;
class RootDomainMap;

struct TransformReplayOptions {
// In theory, it makes more sense to have skip_target_swizzle = true by
// default because this is how we index into the producer and how we propagate
// transformations. However, we are in a very funny situation that:
// BestEffortReplay for swizzle is broken. For example, if we have a
// producer <=> consumer pair like:
//     I1              I0
//    /  \            /  \
//  I1o   I1i       I0o   I0i
//   |     |         |     |
// swizzle I1i     swizzle I0i     <=>    I3    I2
//   |     |         |     |
//  I1o'  I1i       I0o'  I0i
//    \   /           \   /
//     I1'             I0'
// where I1o', I0o' = swizzle(I1o, I0o), we never really skipped swizzle to
// map I1' with I3 and I0' with I2. But even with this error, our swizzle
// indexing worked due to luck. So effectively we were doing
// skip_target_swizzle = false. But today, we can not make this `true` for
// vectorization validation and indexing, because of another bug in
// BestEffortReplay: swizzle skip should happen in an all-or-nothing fashion.
// We can not just skip X but not skip Y, but we are not implementing this
// skip like that. If we make it `true`, this will trigger some error in some
// schedule. So here, in order to avoid exposing one bug, we are more
// explicitly using a wrong behavior that we have been using because this
// wrong behavior has a better luck.
bool skip_target_swizzle = false;
bool replay_swizzle = false;
bool replay_resize = false;
bool replay_allocation = false;

TransformReplayOptions& skipTargetSwizzle(bool value = true) {
skip_target_swizzle = value;
return *this;
}

TransformReplayOptions& replaySwizzle(bool value = true) {
replay_swizzle = value;
return *this;
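For readers unfamiliar with the option-builder idiom that skipTargetSwizzle() follows, here is a minimal, self-contained reproduction outside nvfuser (the struct and field names are shortened stand-ins, not the real type): each flag defaults to false, and each setter defaults its argument to true and returns *this, so call sites can chain options inline as in the propagator changes above.

#include <iostream>

struct Options {
  bool skip_target_swizzle = false;
  bool replay_swizzle = false;

  Options& skipTargetSwizzle(bool value = true) {
    skip_target_swizzle = value;
    return *this;
  }
  Options& replaySwizzle(bool value = true) {
    replay_swizzle = value;
    return *this;
  }
};

int main() {
  // Mirrors the shape of the call sites added in transform_replay.cpp above:
  //   replayPasC(to, from, pos, TransformReplayOptions().skipTargetSwizzle())
  Options opt = Options().skipTargetSwizzle();
  std::cout << "skip_target_swizzle=" << opt.skip_target_swizzle
            << " replay_swizzle=" << opt.replay_swizzle << "\n";
  return 0;
}

As the long comment above explains, skip_target_swizzle stays false by default, and only the transform propagators opt in explicitly.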