combined inner outer reduction, add a simple test case #2400

Open · wants to merge 14 commits into devel
28 changes: 19 additions & 9 deletions third_party/nvfuser/csrc/scheduler/normalization.cpp

@@ -1828,16 +1828,26 @@ void schedulePersistentKernelInnerOuter(
   // directly from output tv using parallelizeAllLike. Must propagate separately
   // for different tvs, as outer reductions are transformed separately.
   if (rparams.vectorization_factor_outer > 1) {
+    auto findVectorizedOutputOf = [&](TensorView* tv) {
+      TensorView* ref_tv = nullptr;
+      for (auto output_tv : ir_utils::outputTvsOf(tv)) {
+        for (auto id : output_tv->domain()->domain()) {
+          if (id->getParallelType() == ParallelType::Vectorize) {
+            ref_tv = output_tv;
+            break;
+          }
+        }
+        if (ref_tv) {
+          break;
+        }
+      }
+      return ref_tv;
+    };
     for (auto tv : cached_gmem_reload) {
-      auto output_tvs = ir_utils::outputTvsOf(tv);
-      TORCH_INTERNAL_ASSERT(
-          !output_tvs.empty(),
-          "cached_gmem_reload should have at least one output tensor.")
-      scheduler_utils::parallelizeAllLike(
-          output_tvs[0],
-          -1,
-          {cached_gmem_reload.begin(), cached_gmem_reload.end()},
-          {ParallelType::Vectorize});
+      if (auto ref_tv = findVectorizedOutputOf(tv)) {
+        scheduler_utils::parallelizeAllLike(
+            ref_tv, -1, {tv}, {ParallelType::Vectorize});
+      }
     }
   }
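
For reference, below is a minimal, self-contained C++ sketch of the search pattern the new findVectorizedOutputOf lambda implements: scan each output's domain for an axis parallelized with Vectorize and return the first matching tensor, or nullptr if none exists. The Tv and Id structs are hypothetical stand-ins for nvfuser's TensorView and IterDomain; only the control flow mirrors the diff.

// Minimal sketch of the findVectorizedOutputOf search pattern.
// Tv and Id are hypothetical stand-ins for TensorView/IterDomain.
#include <iostream>
#include <string>
#include <vector>

enum class ParallelType { Serial, Vectorize };

struct Id {
  ParallelType ptype = ParallelType::Serial;
};

struct Tv {
  std::string name;
  std::vector<Id> domain;
};

// Return the first output whose domain contains a vectorized axis,
// or nullptr if none does; the caller skips propagation in that case.
const Tv* findVectorizedOutputOf(const std::vector<Tv>& outputs) {
  for (const auto& output_tv : outputs) {
    for (const auto& id : output_tv.domain) {
      if (id.ptype == ParallelType::Vectorize) {
        return &output_tv; // early return replaces the flag + double break
      }
    }
  }
  return nullptr;
}

int main() {
  std::vector<Tv> outputs = {
      {"t0", {{ParallelType::Serial}}},
      {"t1", {{ParallelType::Serial}, {ParallelType::Vectorize}}}};
  if (const Tv* ref_tv = findVectorizedOutputOf(outputs)) {
    std::cout << "reference tv: " << ref_tv->name << "\n"; // prints t1
  }
  return 0;
}

Returning from inside the inner loop expresses the same early exit that the lambda implements with the ref_tv flag and two breaks. Note the behavioral change visible in the diff: instead of asserting that outputs exist and propagating from output_tvs[0] across all of cached_gmem_reload, the loop now propagates only when an output with a vectorized axis is actually found, and only to the single tv being processed.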
