Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

combined inner outer reduction, add a simple test case #2400

Open
wants to merge 14 commits into
base: devel
Choose a base branch
from
1 change: 1 addition & 0 deletions third_party/nvfuser/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@ if(BUILD_TEST)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_gather_ops.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_multidevice.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_multicluster_fusion.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_combined_inner_outer_reduction.cpp)

set(JIT_TEST_CU_SRCS)
list(APPEND JIT_TEST_CU_SRCS ${NVFUSER_ROOT}/test/test_gpu_rng.cu)
Expand Down
25 changes: 5 additions & 20 deletions third_party/nvfuser/csrc/executor_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -967,26 +967,11 @@ c10::optional<int> getMaxRegCount(
// If the block size is known, set the maximum that at least allows
// one block to be resident on an SM
if (opt_block_size.has_value() && opt_block_size.value() > 0) {
int num_partition = 0;
int reg_allocation_granularity = 0;
const auto prop = at::cuda::getCurrentDeviceProperties();
cudaOccDeviceProp occ_prop(*prop);
cudaOccSubPartitionsPerMultiprocessor(&num_partition, &occ_prop);
cudaOccRegAllocationGranularity(&reg_allocation_granularity, &occ_prop);
int warp_size = prop->warpSize;
int num_warps = ceilDiv(opt_block_size.value(), warp_size);

// warps could be distributed unevenly across partition
int max_warps_per_sm_partition = ceilDiv(num_warps, num_partition);
// registers are evenly distributed across partitions; the partition with
// the most warps determines the maximum registers available per warp
int max_reg_per_warp =
prop->regsPerBlock / num_partition / max_warps_per_sm_partition;
// clamp down to register allocation granularity at warp level
int effective_max_reg_per_warp = max_reg_per_warp /
reg_allocation_granularity * reg_allocation_granularity;
max_register =
std::min(max_register_limit, effective_max_reg_per_warp / warp_size);
constexpr int block_per_sm = 1;
max_register = std::min(
max_register_limit,
(int)getRegPerThreadGivenThreadsPerSM(
opt_block_size.value() * block_per_sm));
}

// If a heuristic value is given, i.e., max_register_heuristic is
Expand Down
4 changes: 4 additions & 0 deletions third_party/nvfuser/csrc/ir_internal_nodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -1486,6 +1486,10 @@ class TORCH_CUDA_CU_API IterDomain : public Val {
return getIterType() == IterType::Reduction;
}

//! Returns true if this IterDomain's IterType is Iteration, i.e. a
//! plain iteration domain (contrast with isReduction() above, which
//! checks for IterType::Reduction).
bool isIteration() const {
  const auto iter_type = getIterType();
  return iter_type == IterType::Iteration;
}

bool isRFactorProduct() const {
return is_rfactor_domain_;
}
Expand Down
3 changes: 2 additions & 1 deletion third_party/nvfuser/csrc/kernel_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,8 @@ std::vector<at::Tensor> FusionKernelRuntime::runKernelWithInput(
}

auto& executor = executors_[group_id];
if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose) ||
measure_kernel_time_) {
executor.setMeasureKernelTimeFlag(true);
}

Expand Down
5 changes: 5 additions & 0 deletions third_party/nvfuser/csrc/kernel_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ class TORCH_CUDA_CU_API FusionKernelRuntime {
profiling_ = to_profile;
}

//! Enable (default) or disable kernel execution-time measurement.
//! The stored flag is read when running kernels to decide whether the
//! executor should time its launches even without PerfDebugVerbose.
void setMeasureKernelTime(bool enable = true) {
  measure_kernel_time_ = enable;
}

//! Internal knob for profiling shape inference
void disableLaunchParamCache() {
for (auto& executor : executors_) {
Expand Down Expand Up @@ -190,6 +194,7 @@ class TORCH_CUDA_CU_API FusionKernelRuntime {

// States for profiling support
bool profiling_ = false;
bool measure_kernel_time_ = false;

std::mutex mutex_;
// TODO: remove `compiling_` mutex and rely on `mutex_` only.
Expand Down
20 changes: 20 additions & 0 deletions third_party/nvfuser/csrc/maxinfo_propagator.h
Original file line number Diff line number Diff line change
Expand Up @@ -273,4 +273,24 @@ class TORCH_CUDA_CU_API SetSelector : public MaxInfoSpanningTree::Selector {
}
};

// Simple selector to allow different parallel patterns in the fusion.
// The propagation is blocked at boundaryNodesSet.
// For P2C forward propagate, disable propagation to tensorViews in
// boundaryNodesSet. For C2P backward propagate, disable propagation from
// tensorViews in boundaryNodesSet
struct InternalBoundarySelector : public MaxInfoSpanningTree::Selector {
std::unordered_set<TensorView*> tvs_;
virtual bool allowC2P(TensorView* from, TensorView* to) override {
return tvs_.count(from) == 0;
};
virtual bool allowP2C(TensorView* from, TensorView* to) override {
return tvs_.count(to) == 0;
};
virtual bool allowSibling(TensorView* from, TensorView* to) override {
return true;
}
InternalBoundarySelector(const std::unordered_set<TensorView*>& tvs)
: tvs_(tvs) {}
};

} // namespace nvfuser
Loading