Fixing and improving indexing type handling (#2522)
mmigdal-nv authored Mar 8, 2023
1 parent a86f9b0 commit 3b85308
Showing 6 changed files with 210 additions and 51 deletions.
109 changes: 61 additions & 48 deletions third_party/nvfuser/csrc/executor.cpp
@@ -229,9 +229,7 @@ void FusionExecutor::compileFusion(
   // TODO: refactor the options_ passed through
   options_.device = c10::Device(c10::DeviceType::CUDA, args.getDeviceIndex());
   options_.index_mode = args.getIndexMode();
-  compile_params.index_type =
-      (options_.index_mode == KernelIndexMode::INT64 ? DataType::Int
-                                                     : DataType::Int32);
+  compile_params.index_type = DataType::Index;
   c10::DeviceGuard dg(options_.device);

   TORCH_INTERNAL_ASSERT(
@@ -1104,11 +1102,68 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
     launch_params_ =
         computeLaunchParams(launch_constraints, expr_eval, warp_size_);

+    auto alias_indices_entry =
+        executor_utils::caching::ExecutorCompileTimeEntry<
+            executor_utils::caching::InputAliasIndices>(
+            compileTimeDataCache(), [&]() {
+              return std::make_unique<std::vector<std::pair<int, int>>>(
+                  fusion_->getInputAliasIndices());
+            });
+
+    auto& alias_indices = alias_indices_entry.get();
+
+    // We need to push all of the outputs to the KernelArgumentHolder before
+    // checking the indexing type. NOLINTNEXTLINE(bugprone-branch-clone)
+    if (outputs.empty()) {
+      auto output_alias_indices_entry =
+          executor_utils::caching::ExecutorCompileTimeEntry<
+              executor_utils::caching::OutputAliasIndices>(
+              compileTimeDataCache(), [&]() {
+                return std::make_unique<std::unordered_set<int>>(
+                    fusion_->getOutputAliasIndices());
+              });
+
+      auto& output_alias_indices = output_alias_indices_entry.get();
+
+      allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices);
+
+      for (const auto& entry : alias_indices) {
+        auto aliased_output_index = entry.first;
+        auto aliased_input_index = entry.second;
+        auto tensor_arg_abstract =
+            dynamic_cast<const TensorArgAbstract*>(args[aliased_input_index]);
+        TORCH_INTERNAL_ASSERT(
+            tensor_arg_abstract, "alias io only supports tensor");
+        allocated_outputs[aliased_output_index] =
+            tensor_arg_abstract->getTensor();
+      }
+      args.push(allocated_outputs);
+    } else {
+      allocated_outputs = outputs;
+      args.push(outputs);
+      executor_utils::validateKernelOutputs(
+          fusion_, allocated_outputs, options_.device);
+    }
+
     // Recompile the kernel if the number of threads in the block has increased
-    // or maxrregcount has changed
-    if (launch_params_.nThreads() > block_size_high_water_mark ||
-        compile_params.maxrregcount != maxrregcount_high_water_mark) {
+    // or maxrregcount has changed or the nvfuser_index_t size has changed
+    bool need_to_recompile =
+        launch_params_.nThreads() > block_size_high_water_mark ||
+        compile_params.maxrregcount != maxrregcount_high_water_mark;
+
+    // We also recompile if the index type shrinks, as that may lead to faster
+    // code.
+    if (args.getIndexMode() != options_.index_mode) {
+      need_to_recompile = true;
+      options_.index_mode = args.getIndexMode();
+    }
+
+    if (need_to_recompile) {
      const auto kernel = lowered_->kernel();
+      // The index type is contained in the kernel name and, as a result, in
+      // kernel_code_, which can then be used as a key in KernelDb. TODO: this
+      // needs to be cleaned up so that KernelDb's key is not only the kernel
+      // (we also need the GPU arch, maxrregcount, ...)
      kernel_code_ = codegen::generateCudaKernel(kernel, kernelName());
      const auto structured_code = getStructuredCode(kernel_code_);
      block_size_high_water_mark = launch_params_.nThreads();
@@ -1163,48 +1218,6 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
         compileTimeDataCache(),
         expr_eval);

-    auto alias_indices_entry =
-        executor_utils::caching::ExecutorCompileTimeEntry<
-            executor_utils::caching::InputAliasIndices>(
-            compileTimeDataCache(), [&]() {
-              return std::make_unique<std::vector<std::pair<int, int>>>(
-                  fusion_->getInputAliasIndices());
-            });
-
-    auto& alias_indices = alias_indices_entry.get();
-
-    // NOLINTNEXTLINE(bugprone-branch-clone)
-    if (outputs.empty()) {
-      auto output_alias_indices_entry =
-          executor_utils::caching::ExecutorCompileTimeEntry<
-              executor_utils::caching::OutputAliasIndices>(
-              compileTimeDataCache(), [&]() {
-                return std::make_unique<std::unordered_set<int>>(
-                    fusion_->getOutputAliasIndices());
-              });
-
-      auto& output_alias_indices = output_alias_indices_entry.get();
-
-      allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices);
-
-      for (const auto& entry : alias_indices) {
-        auto aliased_output_index = entry.first;
-        auto aliased_input_index = entry.second;
-        auto tensor_arg_abstract =
-            dynamic_cast<const TensorArgAbstract*>(args[aliased_input_index]);
-        TORCH_INTERNAL_ASSERT(
-            tensor_arg_abstract, "alias io only supports tensor");
-        allocated_outputs[aliased_output_index] =
-            tensor_arg_abstract->getTensor();
-      }
-      args.push(allocated_outputs);
-    } else {
-      allocated_outputs = outputs;
-      args.push(outputs);
-      executor_utils::validateKernelOutputs(
-          fusion_, allocated_outputs, options_.device);
-    }
-
     global_buffers = allocGlobalVals(expr_eval);

     if (kernel()->summary().max_rng_offsets >= 0) {
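To summarize the control flow added above: the executor now recompiles not only when the block-size or maxrregcount high-water marks are exceeded, but also whenever the index mode of the incoming arguments differs from the one the kernel was compiled with. Below is a minimal standalone sketch of that decision, illustrative only: KernelIndexMode and the high-water-mark names mirror the diff, while CompileState and needToRecompile are invented for the example.

// Standalone sketch (not committed code) of the recompile decision.
#include <cstdint>

enum class KernelIndexMode { INT32, INT64 };

struct CompileState {
  int64_t block_size_high_water_mark = 1;
  int maxrregcount_high_water_mark = 255;
  KernelIndexMode index_mode = KernelIndexMode::INT32;
};

// Returns true when the kernel must be rebuilt for the next launch.
bool needToRecompile(
    CompileState& state,
    int64_t n_threads,
    int maxrregcount,
    KernelIndexMode requested_index_mode) {
  bool recompile = n_threads > state.block_size_high_water_mark ||
      maxrregcount != state.maxrregcount_high_water_mark;
  // Any index-mode change triggers a rebuild: widening for correctness,
  // narrowing because int32 indexing may produce faster code.
  if (requested_index_mode != state.index_mode) {
    recompile = true;
    state.index_mode = requested_index_mode;
  }
  return recompile;
}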
9 changes: 8 additions & 1 deletion third_party/nvfuser/csrc/executor.h
@@ -206,9 +206,16 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
         last_compiled_binary_, "-fun 1 -c");
   }

+  //! Returns the kernel name.
+  //! The name has a suffix that represents the chosen indexing mode:
+  //! int32 or int64.
+  //! TODO: the suffix is currently used principally as a workaround so we can
+  //! use the kernel string as a key in KernelDb. This should be reworked.
   std::string kernelName() const {
+    const char* index_type_suffix =
+        options_.index_mode == KernelIndexMode::INT64 ? "_int64" : "_int32";
     std::stringstream ss;
-    ss << "kernel" << fusion_id_;
+    ss << "kernel" << fusion_id_ << index_type_suffix;
     return ss.str();
   }

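As a rough sketch of the naming scheme documented above (not the committed code; a free function stands in for the member, and the fusion id is passed explicitly), the suffix makes kernels compiled for different index widths distinct when the kernel string is used as a KernelDb key:

#include <sstream>
#include <string>

enum class KernelIndexMode { INT32, INT64 };

// Sketch of the naming scheme: "kernel<fusion_id><suffix>".
std::string kernelName(int fusion_id, KernelIndexMode mode) {
  const char* suffix = mode == KernelIndexMode::INT64 ? "_int64" : "_int32";
  std::stringstream ss;
  ss << "kernel" << fusion_id << suffix;
  return ss.str(); // e.g. "kernel3_int32" vs. "kernel3_int64"
}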
46 changes: 46 additions & 0 deletions third_party/nvfuser/csrc/executor_kernel_arg.cpp
@@ -152,6 +152,14 @@ KernelArgumentHolder KernelArgumentHolder::createKernelArgumentHolder(

 // Push a tensor to the arguments
 void KernelArgumentHolder::push(const at::Tensor& tensor) {
+  // Deferred update of index_mode_.
+  // The required index-mode size depends on the fusion inputs AND outputs.
+  // TODO: we should also take the other intermediate buffers into account.
+  if (index_mode_ == KernelIndexMode::INT32 &&
+      collectIndexMode({tensor}) == KernelIndexMode::INT64) {
+    setIndexMode(KernelIndexMode::INT64);
+  }
+
   changed_ = true;
   if (is_cpu_scalar(tensor)) {
     switch (tensor.scalar_type()) {
@@ -283,6 +291,44 @@ void** KernelArgumentHolder::getBuffer() {
   return void_ptrs_.data();
 }

+void KernelArgumentHolder::updateTensorIndexModes() {
+  for (auto& arg : arguments_) {
+    TensorArgAbstract* tensor_arg_old =
+        dynamic_cast<TensorArgAbstract*>(arg.get());
+    if (tensor_arg_old == nullptr)
+      continue;
+    auto tensor = tensor_arg_old->getTensor();
+    int nDims = tensor.ndimension();
+    c10::ScalarType dtype = tensor.scalar_type();
+    std::unique_ptr<TensorArgAbstract> tensor_arg =
+        getTensorArg(dtype, nDims, index_mode_);
+    tensor_arg->setTensor(tensor);
+    tensor_arg->setPointer(tensor.data_ptr());
+    tensor_arg->setDataType(aten_to_data_type(dtype));
+    for (const auto i : c10::irange(nDims)) {
+      tensor_arg->setSize(i, tensor.sizes()[i]);
+      tensor_arg->setStride(i, tensor.strides()[i]);
+    }
+    arg = std::move(tensor_arg);
+  }
+}
+
+KernelIndexMode KernelArgumentHolder::getSmallestIndexModeRequired() const {
+  KernelIndexMode smallest = KernelIndexMode::INT32;
+  for (auto& arg : arguments_) {
+    TensorArgAbstract* tensor_arg_old =
+        dynamic_cast<TensorArgAbstract*>(arg.get());
+    if (tensor_arg_old == nullptr)
+      continue;
+    auto tensor = tensor_arg_old->getTensor();
+    auto mode = collectIndexMode({tensor});
+    if (mode == KernelIndexMode::INT64) {
+      smallest = KernelIndexMode::INT64;
+    }
+  }
+  return smallest;
+}
+
 void KernelArgumentHolder::push(const c10::ArrayRef<c10::IValue>& args) {
   // Naive I/O setup; I'm ignoring all the potential transformations (i.e. I/O
   // allocated here from the subgraph could be, and very likely are, different
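The push() change above widens the holder's index mode lazily: the mode starts at INT32 and is promoted to INT64 the first time a pushed tensor needs 64-bit indexing, after which updateTensorIndexModes() rebuilds the already-held tensor arguments. A minimal sketch of that one-way promotion follows; it is illustrative only, with requiredMode standing in for the real collectIndexMode helper in utils.cpp and HolderSketch standing in for KernelArgumentHolder:

#include <cstdint>
#include <vector>

enum class KernelIndexMode { INT32, INT64 };

// Stand-in for collectIndexMode(): a tensor needs 64-bit indexing when its
// largest linear index cannot fit in int32.
KernelIndexMode requiredMode(int64_t most_positive_index) {
  return most_positive_index > INT32_MAX ? KernelIndexMode::INT64
                                         : KernelIndexMode::INT32;
}

struct HolderSketch {
  KernelIndexMode index_mode = KernelIndexMode::INT32;
  std::vector<int64_t> held_index_bounds; // bounds of tensors pushed so far

  void push(int64_t most_positive_index) {
    // Deferred, one-way promotion on push: INT32 -> INT64, never back.
    if (index_mode == KernelIndexMode::INT32 &&
        requiredMode(most_positive_index) == KernelIndexMode::INT64) {
      index_mode = KernelIndexMode::INT64;
      // The real holder also re-creates every held TensorArg at this point
      // (updateTensorIndexModes()).
    }
    held_index_bounds.push_back(most_positive_index);
  }
};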
37 changes: 37 additions & 0 deletions third_party/nvfuser/csrc/executor_kernel_arg.h
@@ -284,6 +284,37 @@ class TORCH_CUDA_CU_API KernelArgumentHolder {
     return index_mode_;
   }

+  // Allows the user to specify a larger index mode than required.
+  void setIndexMode(KernelIndexMode mode) {
+    // index_mode_ is deduced from the args. We won't allow setting it to a
+    // smaller size than required.
+    auto smallest = getSmallestIndexModeRequired();
+
+    // It turns out the current index type is too small.
+    if (smallest == KernelIndexMode::INT64 &&
+        index_mode_ == KernelIndexMode::INT32) {
+      TORCH_INTERNAL_ASSERT(
+          false,
+          "index_mode_ is INT32, but given the state of KernelArgumentHolder it should have already been INT64.");
+    }
+
+    // We can't narrow the index type.
+    if (mode == KernelIndexMode::INT32 && smallest == KernelIndexMode::INT64) {
+      return;
+    }
+
+    // No-op.
+    if (mode == index_mode_) {
+      return;
+    }
+
+    // Increasing or decreasing the index size.
+    if (mode == smallest || mode == KernelIndexMode::INT64) {
+      index_mode_ = mode;
+      updateTensorIndexModes();
+    }
+  }
+
   explicit KernelArgumentHolder(KernelIndexMode index_mode)
       : index_mode_(index_mode) {}

@@ -332,6 +363,12 @@ class TORCH_CUDA_CU_API KernelArgumentHolder {
     return arguments_.back().get();
   }

+  // Goes through all held tensors and changes their index mode.
+  void updateTensorIndexModes();
+
+  // Checks all held tensors to find the smallest index mode required.
+  KernelIndexMode getSmallestIndexModeRequired() const;
+
   void appendPhiloxRNGSeed(uint64_t rand_offset);

   const ArgAbstract* operator[](int ind) const {
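In short, setIndexMode() honors widening, silently ignores narrowing while any held tensor still requires INT64, and asserts on internal inconsistency. A hypothetical call sequence, mirroring the new test in test_gpu3.cpp below (input_small and input_large stand for an int32-safe and an int32-overflowing tensor):

// Hypothetical usage, assuming a holder built from a small (int32-safe) tensor:
KernelArgumentHolder holder =
    KernelArgumentHolder::createKernelArgumentHolder({input_small});
// holder.getIndexMode() == KernelIndexMode::INT32

holder.setIndexMode(KernelIndexMode::INT64); // widening: honored
// holder.getIndexMode() == KernelIndexMode::INT64

holder.push(input_large);                    // tensor that requires INT64
holder.setIndexMode(KernelIndexMode::INT32); // narrowing: silently ignored
// holder.getIndexMode() == KernelIndexMode::INT64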
4 changes: 2 additions & 2 deletions third_party/nvfuser/csrc/utils.cpp
@@ -323,11 +323,11 @@ KernelIndexMode collectIndexMode(const at::ArrayRef<c10::IValue>& inputs) {
       if (tensor_input.stride(dim_i) > 0) {
         // Accumulate positive stride
         tensor_most_positive_index +=
-            (tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i);
+            tensor_input.size(dim_i) * tensor_input.stride(dim_i);
       } else {
         // Accumulate negative stride
         tensor_most_negative_index +=
-            (tensor_input.size(dim_i) - 1) * tensor_input.stride(dim_i);
+            tensor_input.size(dim_i) * tensor_input.stride(dim_i);
       }
     }
   }
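The utils.cpp change makes the index-range estimate more conservative: the largest offset actually addressed in a dimension is (size - 1) * stride, but the new code accumulates size * stride, leaving one extra stride of headroom per dimension, so borderline tensors are now pushed to INT64 rather than risking int32 overflow. A small worked example under that reading, using standalone sizes and strides rather than at::Tensor:

#include <cstdint>
#include <vector>

// Conservative per-tensor index bound, as in the updated collectIndexMode():
// accumulate size * stride (not (size - 1) * stride) over all dimensions.
bool fitsInInt32(const std::vector<int64_t>& sizes,
                 const std::vector<int64_t>& strides) {
  int64_t most_positive = 0;
  for (size_t i = 0; i < sizes.size(); ++i) {
    if (strides[i] > 0) {
      most_positive += sizes[i] * strides[i];
    }
  }
  return most_positive <= INT32_MAX;
}

// Example: a contiguous {1024, 4 << 20} tensor (as in the new test) yields a
// bound of at least 1024 * (4 << 20) = 2^32 > INT32_MAX, so it requires
// INT64, while a contiguous {1024, 1024} tensor stays well within INT32.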
56 changes: 56 additions & 0 deletions third_party/nvfuser/test/test_gpu3.cpp
@@ -7750,6 +7750,62 @@ TEST_F(NVFuserTest, FusionPredicateReductionInitGlobal_CUDA) {
       fe.kernel(), cg_outputs, inputs, {ref_t1, ref_t3}, __LINE__, __FILE__);
 }

+TEST_F(NVFuserTest, FusionExecutorCache_IndexType_Recompilation) {
+  auto fusion = std::make_unique<Fusion>();
+  FusionGuard fg(fusion.get());
+
+  auto tv0 = makeContigTensor(2, DataType::Half);
+  auto tv1 = makeContigTensor(2, DataType::Half);
+  auto tv2 = add(tv0, tv1);
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  fusion->addOutput(tv2);
+
+  auto fec = std::make_unique<FusionExecutorCache>(std::move(fusion));
+
+  auto options = at::TensorOptions().dtype(at::kHalf).device(at::kCUDA, 0);
+  auto input_small = at::ones({1024, 1024}, options);
+  auto input_large = at::ones({1024, 4 * 1 << 20}, options);
+
+  KernelArgumentHolder small_holder =
+      KernelArgumentHolder::createKernelArgumentHolder({input_small});
+  TORCH_CHECK(
+      small_holder.getIndexMode() ==
+      KernelIndexMode::INT32); // Index mode should be INT32
+
+  small_holder.setIndexMode(
+      KernelIndexMode::INT64); // Index mode should go up in size.
+  TORCH_CHECK(small_holder.getIndexMode() == KernelIndexMode::INT64);
+
+  small_holder.push(input_large);
+  small_holder.setIndexMode(KernelIndexMode::INT32);
+  TORCH_CHECK(
+      small_holder.getIndexMode() ==
+      KernelIndexMode::INT64); // Index mode shouldn't change.
+
+  // First run with the smaller input
+  fec->runFusionWithInputs({input_small, input_small});
+  auto kernel_runtime_for_input_small = fec->getMostRecentKernelRuntime();
+
+  // Run with the larger input
+  fec->runFusionWithInputs({input_large, input_large});
+  auto kernel_runtime_for_input_large = fec->getMostRecentKernelRuntime();
+
+  // Make sure they correspond to different runtimes, just in case
+  TORCH_CHECK(
+      kernel_runtime_for_input_small != kernel_runtime_for_input_large,
+      "Unexpected kernel runtime");
+
+  // Second run with the smaller input
+  fec->runFusionWithInputs({input_small, input_small});
+  auto second_kernel_runtime_for_input_small =
+      fec->getMostRecentKernelRuntime();
+
+  TORCH_CHECK(
+      kernel_runtime_for_input_small == second_kernel_runtime_for_input_small,
+      "Expected to reuse the same runtime");
+}
+
 // Test file size should be up to 10K LoC. Create a new file for more tests.

 } // namespace nvfuser
