add epilogue to store MMA results in shared memory before write to #387

Merged Jul 17, 2023 (36 commits; branch llu/matmul_epilogue into main).
Commits (36, all by liqiangxl). The diff below reflects changes from 29 of these commits:
379eab5  add epilogue to store MMA results in shared memory before write to (May 22, 2023)
b500d36  revise test (Jun 8, 2023)
acf1167  format (Jun 8, 2023)
be885b0  swizzleSharedMemory (Jun 20, 2023)
7138cbe  format (Jun 20, 2023)
fad4ad8  fix failed test cases (Jun 21, 2023)
84e3e98  propagate to epilogue tensors (Jun 28, 2023)
a94e5df  check num_shared_mem_tensors (Jun 28, 2023)
9f4bcc4  format (Jun 28, 2023)
1760c15  disable_smem_epilogue (Jun 28, 2023)
f0ff6f9  extend MatmulSASSTest (Jun 28, 2023)
da5dc3a  schedule output tensor (Jun 29, 2023)
537b855  wip (Jun 29, 2023)
ab86a1f  use propagate (Jul 1, 2023)
f2a75cd  fix failed case (Jul 3, 2023)
7a4d5b5  fix ci fails by increasing tolerance:x (Jul 4, 2023)
86b8911  merge main (Jul 7, 2023)
5586b3a  fix failed cases (Jul 7, 2023)
6a8f139  trivial fix (Jul 7, 2023)
d6212cb  format (Jul 9, 2023)
95ea553  revise hasEnoughSharedMemoryForEpilogue (Jul 9, 2023)
925e04d  merge main (Jul 9, 2023)
1f30a36  wip (Jul 10, 2023)
80d7588  cacheAfter mma_result (Jul 10, 2023)
d3019f0  add epilogue cast and relu tests (Jul 10, 2023)
212258c  trivial fix (Jul 10, 2023)
a2045cd  mma data types (Jul 12, 2023)
32f43d8  merge main (Jul 12, 2023)
67ecdb0  revise smem swizzle (Jul 13, 2023)
864a918  test with revised swizzle (Jul 13, 2023)
79c452c  merge main (Jul 14, 2023)
26d970d  save file (Jul 14, 2023)
189cef4  revise based on review comments (Jul 17, 2023)
889ce23  rename (Jul 17, 2023)
94aae9b  change default to false (Jul 17, 2023)
27870d0  Merge branch 'main' into llu/matmul_epilogue (Jul 17, 2023)
357 changes: 266 additions & 91 deletions csrc/scheduler/matmul.cpp

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions csrc/scheduler/matmul_heuristic.h
@@ -90,6 +90,10 @@ class MatmulParams : public HeuristicParams {
//! C3 C4 D3 D4
int grid_swizzle_factor = 1;

//! Unswizzle MMA results in shared memory to get
//! coalesced write to global memory
bool has_smem_epilogue = true;

std::string toString() const override {
std::stringstream ss;
ss << "\n===== Matmul Parameters ========\n"
@@ -112,6 +116,7 @@
: "column-major")
<< "\n"
<< "Grid swizzle factor: " << grid_swizzle_factor << "\n"
<< "Use shared memory epilogue: " << has_smem_epilogue << "\n"
<< "====================================\n";
return ss.str();
}
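A minimal sketch of how the new flag is meant to be used alongside the existing parameters; the field names and the printed label come from the diff above, while the include path and surrounding function are illustrative assumptions:

#include <scheduler/matmul_heuristic.h>
#include <iostream>

void dumpMatmulParams() {
  nvfuser::MatmulParams params;     // all other fields keep their defaults
  params.grid_swizzle_factor = 1;
  params.has_smem_epilogue = true;  // route MMA results through shared memory
  // The debug dump now also reports the flag, printed as 1/0:
  // "Use shared memory epilogue: 1"
  std::cout << params.toString();
}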
34 changes: 34 additions & 0 deletions csrc/scheduler/matmul_utils.cpp
@@ -150,6 +150,23 @@ inline bool initExtraHeuristics(
return true;
}

//! A wrapper to get MMA Tensor data types
drzejan2 marked this conversation as resolved.
//! The order of returned types: INPUT_A, INPUT_B, OUTPUT_D
inline mma_utils::MmaDataTypes getMmaDataTypes(
const std::map<MatmulRole, std::vector<TensorView*>>& roles_map) {
auto getMMADataType = [&](MatmulRole role) {
auto entry = roles_map.find(role);
if (entry != roles_map.end() && !entry->second.empty()) {
return entry->second.front()->dtype();
}
TORCH_INTERNAL_ASSERT(false, "Get MMA Tensor data type failed!");
};
const auto a_type = getMMADataType(MatmulRole::INPUT_A);
const auto b_type = getMMADataType(MatmulRole::INPUT_B);
const auto c_type = getMMADataType(MatmulRole::OUTPUT_D);
return mma_utils::MmaDataTypes{a_type, b_type, c_type};
}

//! A helper for getting problem shape from fusion and runtime info.
ProblemShape getProblemShape(
Fusion* fusion,
@@ -398,6 +415,23 @@ std::shared_ptr<MatmulParams> getMatmulHeuristics(
// Disable magic zero for matmul kernels
params->cparams.enable_magic_zero = false;

// Disable shared memory epilogue before shared memory reuse is implemented.
// Otherwise, there will be performance regression due to reduced occupancy
// caused by extra shared memory usage.
constexpr bool allow_smem_epilogue = true;
Collaborator:
Is this still true with the new hasEnoughSharedMemoryForEpilogue that takes register occupancy into consideration?

Collaborator (Author):
In the case of FusionAmpereMatmulSmemEpilogue_CUDA, the settings of cta_tile (64, 128, 32), warp_tile (32, 32, 32), and smem_double_buffer_stage (2) are purposefully chosen to produce a constant occupancy of 25% whether has_smem_epilogue is true or false. This lets me evaluate the influence of has_smem_epilogue in isolation. There is no noticeable change in performance for TT, NT, TN (kernels 1-3), but for NN (kernel 4) there is a slight decrease. I checked the global memory write index: the access is coalesced for has_smem_epilogue = true and strided for has_smem_epilogue = false.

has_smem_epilogue = false
smem_false_64_128_32.log:kernel1 run in 1.19603 ms, achieved: 112.219 GB/s
smem_false_64_128_32.log:kernel2 run in 1.18682 ms, achieved: 113.091 GB/s
smem_false_64_128_32.log:kernel3 run in 1.00045 ms, achieved: 134.158 GB/s
smem_false_64_128_32.log:kernel4 run in 1.16224 ms, achieved: 115.482 GB/s

has_smem_epilogue = true
smem_true_64_128_32.log:kernel1 run in 1.19296 ms, achieved: 112.508 GB/s
smem_true_64_128_32.log:kernel2 run in 1.19091 ms, achieved: 112.702 GB/s
smem_true_64_128_32.log:kernel3 run in 0.995328 ms, achieved: 134.848 GB/s
smem_true_64_128_32.log:kernel4 run in 1.18067 ms, achieved: 113.679 GB/s

Global Memory write index for MatmulLayout::NN with has_smem_epilogue = false:

smem_false4_64_128_32_write_index
tidyz=00 tidx=0 from=0 to=0
tidyz=00 tidx=1 from=0 to=2
tidyz=00 tidx=2 from=0 to=4
tidyz=00 tidx=3 from=0 to=6
tidyz=00 tidx=4 from=0 to=4096
tidyz=00 tidx=5 from=0 to=4098
tidyz=00 tidx=6 from=0 to=4100
tidyz=00 tidx=7 from=0 to=4102
tidyz=00 tidx=8 from=0 to=8192
tidyz=00 tidx=9 from=0 to=8194
tidyz=00 tidx=10 from=0 to=8196
tidyz=00 tidx=11 from=0 to=8198
tidyz=00 tidx=12 from=0 to=12288
tidyz=00 tidx=13 from=0 to=12290
tidyz=00 tidx=14 from=0 to=12292
tidyz=00 tidx=15 from=0 to=12294
tidyz=00 tidx=16 from=0 to=16384
tidyz=00 tidx=17 from=0 to=16386
tidyz=00 tidx=18 from=0 to=16388
tidyz=00 tidx=19 from=0 to=16390
tidyz=00 tidx=20 from=0 to=20480
tidyz=00 tidx=21 from=0 to=20482
tidyz=00 tidx=22 from=0 to=20484
tidyz=00 tidx=23 from=0 to=20486
tidyz=00 tidx=24 from=0 to=24576
tidyz=00 tidx=25 from=0 to=24578
tidyz=00 tidx=26 from=0 to=24580
tidyz=00 tidx=27 from=0 to=24582
tidyz=00 tidx=28 from=0 to=28672
tidyz=00 tidx=29 from=0 to=28674
tidyz=00 tidx=30 from=0 to=28676
tidyz=00 tidx=31 from=0 to=28678

Global Memory write index for MatmulLayout::NN with has_smem_epilogue = true:

smem_true4_64_128_32_write_index
tidyz=00 tidx=0 from=0 to=0
tidyz=00 tidx=1 from=4 to=4
tidyz=00 tidx=2 from=8 to=8
tidyz=00 tidx=3 from=12 to=12
tidyz=00 tidx=4 from=16 to=16
tidyz=00 tidx=5 from=20 to=20
tidyz=00 tidx=6 from=24 to=24
tidyz=00 tidx=7 from=28 to=28
tidyz=00 tidx=8 from=32 to=32
tidyz=00 tidx=9 from=36 to=36
tidyz=00 tidx=10 from=40 to=40
tidyz=00 tidx=11 from=44 to=44
tidyz=00 tidx=12 from=48 to=48
tidyz=00 tidx=13 from=52 to=52
tidyz=00 tidx=14 from=56 to=56
tidyz=00 tidx=15 from=60 to=60
tidyz=00 tidx=16 from=64 to=64
tidyz=00 tidx=17 from=68 to=68
tidyz=00 tidx=18 from=72 to=72
tidyz=00 tidx=19 from=76 to=76
tidyz=00 tidx=20 from=80 to=80
tidyz=00 tidx=21 from=84 to=84
tidyz=00 tidx=22 from=88 to=88
tidyz=00 tidx=23 from=92 to=92
tidyz=00 tidx=24 from=96 to=96
tidyz=00 tidx=25 from=100 to=100
tidyz=00 tidx=26 from=104 to=104
tidyz=00 tidx=27 from=108 to=108
tidyz=00 tidx=28 from=112 to=112
tidyz=00 tidx=29 from=116 to=116
tidyz=00 tidx=30 from=120 to=120
tidyz=00 tidx=31 from=124 to=124


Collaborator:
So changing has_smem_epilogue to true makes the memory access contiguous, but the perf drops? That is strange.
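One way to read the two dumps above is to count how many 128-byte sectors the first warp's store touches in each case. This is a small standalone sketch; it assumes the dumped indices are element offsets into a half-precision (2-byte) output and hard-codes the patterns from the "to" columns:

#include <cstdio>
#include <set>

int main() {
  std::set<long> strided_sectors, coalesced_sectors;
  for (long tidx = 0; tidx < 32; ++tidx) {
    // has_smem_epilogue = false: groups of 4 threads, each group 4096 elements apart
    const long idx_strided = (tidx / 4) * 4096 + (tidx % 4) * 2;
    // has_smem_epilogue = true: threads cover consecutive 4-element chunks
    const long idx_coalesced = tidx * 4;
    strided_sectors.insert(idx_strided * 2 / 128);    // byte address -> 128 B sector
    coalesced_sectors.insert(idx_coalesced * 2 / 128);
  }
  // Prints 8 vs 2: the strided pattern touches four times as many 128 B
  // sectors per warp store, which is what the shared memory epilogue removes.
  std::printf("sectors touched per warp: strided=%zu coalesced=%zu\n",
              strided_sectors.size(), coalesced_sectors.size());
  return 0;
}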

if (allow_smem_epilogue) {
const auto& roles_map_opt = mma_utils::getTensorsRoles(fusion);
TORCH_INTERNAL_ASSERT(
roles_map_opt.isValid(), "Tensor roles map in mma is not valid.");
// Check if we have enough shared memory for epilogue
params->has_smem_epilogue = mma_utils::hasEnoughSharedMemoryForEpilogue(
params->tile_sizes,
params->double_buffer_options.smem_double_buffer_stage,
getMmaDataTypes(roles_map_opt.getData()));
} else {
params->has_smem_epilogue = false;
}

if (isDebugDumpEnabled(DebugDumpOption::MatmulChecks)) {
printMsg(params->toString());
}
60 changes: 50 additions & 10 deletions csrc/scheduler/mma_utils.cpp
@@ -6,6 +6,7 @@
*/
// clang-format on

#include <ATen/cuda/CUDAContext.h>
#include <device_lower/utils.h>
#include <expr_evaluator.h>
#include <ir/printer.h>
@@ -14,11 +15,49 @@
#include <scheduler/utils.h>
#include <variant>
#include "mma_type.h"

namespace nvfuser {

namespace mma_utils {

bool hasEnoughSharedMemoryForEpilogue(
const MatMulTileOptions& gemm_tile,
const int smem_double_buffer_stage,
const MmaDataTypes& data_types) {
const auto properties = at::cuda::getCurrentDeviceProperties();
const size_t device_smem_limit = properties->sharedMemPerBlockOptin;

auto warp_dims = gemm_tile.cta_tile / gemm_tile.warp_tile;
const auto threads_per_block =
warp_dims.m * warp_dims.n * warp_dims.k * properties->warpSize;
// a thread can use up to 255 registers, blocks per sm is limited by available
// registers
const auto threads_per_sm = getThreadsPerSMGivenRegPerThread(255);
const auto blocks_per_sm = threads_per_sm / threads_per_block;
Collaborator:
nit: rename to blocks_per_sm_by_register

// see scheduleContiguousVectorLoad
const int vector_word = 8;
const int round_to_factor = warp_dims.m * warp_dims.n * warp_dims.k *
properties->warpSize * vector_word;
const int mk = gemm_tile.cta_tile.m * gemm_tile.cta_tile.k;
const int nk = gemm_tile.cta_tile.n * gemm_tile.cta_tile.k;
const size_t smem_a = (size_t)(ceilDiv(mk, round_to_factor) *
round_to_factor * smem_double_buffer_stage) *
dataTypeSize(data_types[0]);
const size_t smem_b = (size_t)(ceilDiv(nk, round_to_factor) *
drzejan2 marked this conversation as resolved.
round_to_factor * smem_double_buffer_stage) *
dataTypeSize(data_types[1]);
const size_t smem_c = (size_t)(gemm_tile.cta_tile.m * gemm_tile.cta_tile.n) *
dataTypeSize(data_types[2]);

// use additional shared memory for epilogue if blocks per sm is not changed
const auto blocks_per_sm_without_smem_epilogue =
std::min(device_smem_limit / (smem_a + smem_b), (size_t)blocks_per_sm);
const auto blocks_per_sm_with_smem_epilogue = std::min(
device_smem_limit / (smem_a + smem_b + smem_c), (size_t)blocks_per_sm);
return blocks_per_sm_with_smem_epilogue ==
blocks_per_sm_without_smem_epilogue ||
blocks_per_sm_with_smem_epilogue > 0;
Collaborator (Author):
The "|| blocks_per_sm_with_smem_epilogue > 0" condition is temporary, added to allow testing as many cases as possible without considering performance. It will be deleted before merge.

}
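To make the check concrete, here is a standalone reimplementation of its arithmetic for the configuration discussed in the review thread (cta_tile 64x128x32, warp_tile 32x32x32, two double-buffer stages, half-precision operands and output). The 163 KiB opt-in shared memory limit and the register-limited count of one block per SM are assumed A100 values, not taken from this PR:

#include <algorithm>
#include <cstdio>

int main() {
  const int cta_m = 64, cta_n = 128, cta_k = 32;
  const int warp_m = 32, warp_n = 32, warp_k = 32;
  const int stages = 2, warp_size = 32, vector_word = 8;
  const long dtype_size = 2;  // half for A, B and D in this sketch

  auto ceil_div = [](int a, int b) { return (a + b - 1) / b; };
  const int warps = (cta_m / warp_m) * (cta_n / warp_n) * (cta_k / warp_k);  // 8
  const int round_to = warps * warp_size * vector_word;                      // 2048

  const long smem_a = ceil_div(cta_m * cta_k, round_to) * round_to * stages * dtype_size;  // 8 KiB
  const long smem_b = ceil_div(cta_n * cta_k, round_to) * round_to * stages * dtype_size;  // 16 KiB
  const long smem_c = long(cta_m) * cta_n * dtype_size;                                    // 16 KiB

  const long smem_limit = 163 * 1024;  // assumed sharedMemPerBlockOptin (A100)
  const long blocks_by_regs = 1;       // assumed register-limited blocks per SM
  const long without_epilogue = std::min(smem_limit / (smem_a + smem_b), blocks_by_regs);
  const long with_epilogue = std::min(smem_limit / (smem_a + smem_b + smem_c), blocks_by_regs);

  // Both bounds come out as 1 block per SM, so reserving the extra epilogue
  // buffer does not reduce occupancy for this tile configuration.
  std::printf("A=%ld B=%ld C=%ld blocks/SM: %ld vs %ld\n",
              smem_a, smem_b, smem_c, without_epilogue, with_epilogue);
  return 0;
}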

void scheduleWarpTileWithReduction(TensorView* tv, MatMulTileOptions tile) {
// Assumes
// [M, N, K]
@@ -379,11 +418,6 @@ bool canValidateIsInnerDim(
if (!split->factor()->isConstInt()) {
return false;
}
if (split->factor()->evaluateInt() < inner_dim_size) {
drzejan2 marked this conversation as resolved.
// This might be too restrictive. Would need more
// bookkeeping to relax.
return false;
}
zasdfgbnm marked this conversation as resolved.
leaf = split->in();
} else if (auto merge = dynamic_cast<Merge*>(expr)) {
// Might consider just rejecting merge.
@@ -396,9 +430,6 @@
if (!leaf->extent()->isConstInt()) {
return false;
}
if (leaf->extent()->evaluateInt() != inner_dim_size) {
return false;
}
leaf = merge->inner();
} else {
// No support for swizzled inner dim for now.
@@ -438,7 +469,9 @@ void checkDimSize(
":",
id->extent()->evaluateInt(),
"vs",
expect[axis_index]);
expect[axis_index],
"\n for tv: ",
tv->toString());
}
}

@@ -699,6 +732,13 @@ void validateMmaRootInnerMNK(
//! swizzles to the right axes.
//! This check will be relaxed as we build out the mma usage patterns.
void validateMmaRootInnerMN(TensorView* tv, MmaOptions options, int m, int n) {
auto is_mma_output =
tv->definition() != nullptr && tv->definition()->isA<MmaOp>();
// This function is also used to transform epilogue tensor. It is not a mma
// output and can skip the following checks.
if (!is_mma_output) {
return;
}
zasdfgbnm marked this conversation as resolved.
auto mma = options.mmaOp();
auto m_dims = getMmaRootDimensions(tv, mma, MmaDimension::M);
auto n_dims = getMmaRootDimensions(tv, mma, MmaDimension::N);
10 changes: 10 additions & 0 deletions csrc/scheduler/mma_utils.h
@@ -226,6 +226,10 @@ using ProblemIterDomains = std::array<IterDomain*, 3>;
//! a single tv, for example input for beta scaling in epilogue
using RolesMap = std::map<MatmulRole, std::vector<TensorView*>>;

//! An alias for storing data types of the tensors in the mma op
//! the order is INPUT_A, INPUT_B, OUTPUT_D
using MmaDataTypes = std::array<DataType, 3>;

//! A wrapper for data containers with optional error message stored if
//! initialization of the data fails.
template <typename DataType>
@@ -289,6 +293,12 @@ TORCH_CUDA_CU_API ProblemIterDomainsOpt getProblemIterDomains(Fusion* fusion);
//! be gathered.
TORCH_CUDA_CU_API RolesMapOpt getTensorsRoles(Fusion* fusion);

//! Check if there is enough shared memory for the given tile options
TORCH_CUDA_CU_API bool hasEnoughSharedMemoryForEpilogue(
const MatMulTileOptions& gemm_tile,
const int smem_double_buffer_stage,
const MmaDataTypes& data_types);

} // namespace mma_utils

} // namespace nvfuser
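A hedged usage sketch of the new helper, with the tile sizes from the review discussion above. The GemmTile and MatMulTileOptions member names are assumed from mma_type.h rather than shown in this diff:

#include <scheduler/mma_utils.h>

using namespace nvfuser;

bool smemEpilogueFits() {
  MatMulTileOptions gemm_tile;
  gemm_tile.cta_tile = GemmTile(64, 128, 32);        // CTA tile (M, N, K)
  gemm_tile.warp_tile = GemmTile(32, 32, 32);        // warp tile (M, N, K)
  gemm_tile.instruction_tile = GemmTile(16, 8, 16);  // Ampere mma instruction tile

  // True when the heuristic decides the epilogue buffer still fits.
  return mma_utils::hasEnoughSharedMemoryForEpilogue(
      gemm_tile,
      /*smem_double_buffer_stage=*/2,
      mma_utils::MmaDataTypes{DataType::Half, DataType::Half, DataType::Float});
}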