Skip to content

Commit

Permalink
reduce aot build time
Browse files Browse the repository at this point in the history
  • Loading branch information
evetsso authored Nov 4, 2022
1 parent 8dd14d0 commit 979a451
Show file tree
Hide file tree
Showing 15 changed files with 245 additions and 79 deletions.
2 changes: 1 addition & 1 deletion .jenkins/common.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def runTestCommand (platform, project, boolean debug=false)
def command = """#!/usr/bin/env bash
set -x
cd ${project.paths.project_build_prefix}/build/${directory}/clients/staging
ROCM_PATH=/opt/rocm GTEST_LISTENER=NO_PASS_LINE_IN_LOG ./${testBinaryName} --gtest_color=yes --R 80 --gtest_filter='-rocfft_UnitTest.repo_twiddle'
ROCM_PATH=/opt/rocm GTEST_LISTENER=NO_PASS_LINE_IN_LOG ./${testBinaryName} --precompile --gtest_color=yes --R 80 --gtest_filter='-rocfft_UnitTest.repo_twiddle'
"""
platform.runCommand(this, command)
}
Expand Down
1 change: 0 additions & 1 deletion .jenkins/performance.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ def runCompileCommand(platform, project, jobName, boolean debug=false, boolean b
def runTestCommand (platform, project, boolean debug=false)
{
String sudo = auxiliary.sudo(platform.jenkinsLabel)
String testBinaryName = debug ? 'rocfft-test-d' : 'rocfft-test'
String directory = debug ? 'debug' : 'release'

def dataTypes = ['single', 'double']
Expand Down
3 changes: 2 additions & 1 deletion clients/tests/accuracy_test_adhoc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ std::vector<std::vector<size_t>> adhoc_sizes = {
// or
// L1D_TRTRT (T + L1D_CC + STOCKHAM_BL_CC + STOCKHAM_BL_RC + T + STOCKHAM + T)
// for lengthBlue > 4096^2.
{196597, 25165813},
{196597},
{25165813},

// TILE_UNALIGNED type of SBRC 3D ERC
{98, 98, 98},
Expand Down
79 changes: 76 additions & 3 deletions clients/tests/gtest_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,17 @@
/// @brief googletest based unit tester for rocfft
///

#include <chrono>
#include <fstream>
#include <gtest/gtest.h>
#include <iostream>
#include <streambuf>
#include <string>
#include <thread>

#include "../../shared/concurrency.h"
#include "../../shared/environment.h"
#include "../../shared/work_queue.h"
#include "../rocfft_params.h"
#include "rocfft.h"
#include "rocfft_accuracy_test.h"
Expand Down Expand Up @@ -85,6 +90,70 @@ static size_t get_system_memory_GiB()
#endif
}

/// @brief Build FFT plans for the accuracy test cases before the tests run.
///
/// Walks every test registered with googletest, keeps the ones whose name
/// contains both "vs_fftw/" and "_batch_1_", strips the "vs_fftw/" prefix
/// to obtain a token, and then sets up rocfft_params for each token on a
/// pool of worker threads.  ROCFFT_INTERNAL_COMPILE_ONLY is set to "1" for
/// the duration of the work and restored afterwards.  Progress and total
/// elapsed time are printed to stdout.
void precompile_test_kernels()
{
    puts("precompiling test kernels...");
    WorkQueue<std::string> tokenQueue;

    // test names of interest carry this prefix in front of the token
    const std::string accuracy_prefix = "vs_fftw/";

    std::vector<std::string> tokens;
    auto                     ut = testing::UnitTest::GetInstance();
    for(int ts_index = 0; ts_index < ut->total_test_suite_count(); ++ts_index)
    {
        const auto ts = ut->GetTestSuite(ts_index);
        // skip disabled suites
        if(strncmp(ts->name(), "DISABLED", 8) == 0)
            continue;
        for(int ti_index = 0; ti_index < ts->total_test_count(); ++ti_index)
        {
            const auto  ti   = ts->GetTestInfo(ti_index);
            std::string name = ti->name();
            // only care about accuracy tests
            const auto prefix_pos = name.find(accuracy_prefix);
            if(prefix_pos == std::string::npos || name.find("_batch_1_") == std::string::npos)
                continue;

            // Remove everything up to and including the prefix that was
            // actually matched, rather than a fixed character count from
            // the start of the name.
            name.erase(0, prefix_pos + accuracy_prefix.size());

            // An empty string is the "stop" signal for the workers below,
            // so it must never be queued as a real work item.
            if(name.empty())
                continue;
            tokens.emplace_back(std::move(name));
        }
    }

    // randomize the order in which tokens are handed to the workers
    std::random_device dev;
    std::mt19937       rng(dev());
    std::shuffle(tokens.begin(), tokens.end(), rng);
    auto precompile_begin = std::chrono::steady_clock::now();
    printf("precompiling %zu FFT plans...\n", tokens.size());

    for(auto&& t : tokens)
        tokenQueue.push(std::move(t));

    // keep the variable set only while this function is running
    EnvironmentSetTemp       env_compile_only{"ROCFFT_INTERNAL_COMPILE_ONLY", "1"};
    const size_t             NUM_THREADS = rocfft_concurrency();
    std::vector<std::thread> threads;
    threads.reserve(NUM_THREADS);
    for(size_t i = 0; i < NUM_THREADS; ++i)
    {
        threads.emplace_back([&tokenQueue]() {
            for(;;)
            {
                std::string token{tokenQueue.pop()};
                // empty token means there is no more work for this thread
                if(token.empty())
                    break;
                rocfft_params params;
                params.from_token(token);
                params.validate();
                params.setup_structs();
            }
        });
        // insert empty tokens to tell threads to stop (one per thread,
        // queued behind all of the real tokens)
        tokenQueue.push({});
    }
    for(auto& t : threads)
        t.join();

    auto precompile_end = std::chrono::steady_clock::now();
    std::chrono::duration<double, std::milli> precompile_ms = precompile_end - precompile_begin;
    printf("done precompiling FFT plans in %.2f ms\n", precompile_ms.count());
}

int main(int argc, char* argv[])
{
// NB: If we initialize gtest first, then it removes all of its own command-line
Expand Down Expand Up @@ -167,7 +236,8 @@ int main(int argc, char* argv[])
po::value<std::string>(&fftw_wisdom_filename)->default_value("wisdom3.txt"),
"FFTW3 wisdom filename")
("scalefactor", po::value<double>(&manual_params.scale_factor), "Scale factor to apply to output.")
("token", po::value<std::string>(&test_token)->default_value(""), "Test token name for manual test");
("token", po::value<std::string>(&test_token)->default_value(""), "Test token name for manual test")
("precompile", "Precompile kernels for all test cases before running tests");
// clang-format on

po::variables_map vm;
Expand Down Expand Up @@ -197,8 +267,8 @@ int main(int argc, char* argv[])
#ifdef FFTW_MULTITHREAD
fftw_init_threads();
fftwf_init_threads();
fftw_plan_with_nthreads(std::thread::hardware_concurrency());
fftwf_plan_with_nthreads(std::thread::hardware_concurrency());
fftw_plan_with_nthreads(rocfft_concurrency());
fftwf_plan_with_nthreads(rocfft_concurrency());
#endif

if(use_fftw_wisdom)
Expand Down Expand Up @@ -296,6 +366,9 @@ int main(int argc, char* argv[])
}
}

if(vm.count("precompile"))
precompile_test_kernels();

auto retval = RUN_ALL_TESTS();

if(use_fftw_wisdom)
Expand Down
23 changes: 0 additions & 23 deletions clients/tests/unit_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,29 +158,6 @@ TEST(rocfft_UnitTest, repo_twiddle)
rocfft_plan_destroy(plan_inverse);
}

// Scoped environment override: the constructor records the variable's
// current value (if non-empty) and sets the new one; the destructor
// puts the recorded value back, or unsets the variable when nothing
// was recorded.
struct EnvironmentSetTemp
{
    EnvironmentSetTemp(const char* _var, const char* val)
        : var(_var)
    {
        // remember what was there before overwriting it
        const auto previous = rocfft_getenv(_var);
        if(!previous.empty())
            oldvalue = previous;
        rocfft_setenv(_var, val);
    }
    ~EnvironmentSetTemp()
    {
        if(!oldvalue.empty())
        {
            // a value was recorded at construction - restore it
            rocfft_setenv(var.c_str(), oldvalue.c_str());
            return;
        }
        // nothing recorded, so remove the variable again
        rocfft_unsetenv(var.c_str());
    }
    // name of the overridden variable
    std::string var;
    // value recorded at construction; empty if none was recorded
    std::string oldvalue;
};

// Check whether logs can be emitted from multiple threads properly
TEST(rocfft_UnitTest, log_multithreading)
{
Expand Down
1 change: 1 addition & 0 deletions library/src/include/node_factory.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class NodeFactory
static const Map1DLength map1DLengthDouble;

static bool Large1DLengthsValid(const Map1DLength& map1DLength, rocfft_precision precision);
static bool CheckLarge1DMaps();

public:
// Create node (user level) using this function
Expand Down
15 changes: 12 additions & 3 deletions library/src/node_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -288,11 +288,21 @@ bool NodeFactory::Large1DLengthsValid(const NodeFactory::Map1DLength& map1DLengt
return true;
}

// Validate both large-1D length maps (single and double precision).
// The results are held in function-local statics, so each map is
// validated only on the first call and the stored answers are reused
// on every later call.
bool NodeFactory::CheckLarge1DMaps()
{
    static bool singleMapOk
        = NodeFactory::Large1DLengthsValid(NodeFactory::map1DLengthSingle, rocfft_precision_single);
    static bool doubleMapOk
        = NodeFactory::Large1DLengthsValid(NodeFactory::map1DLengthDouble, rocfft_precision_double);

    if(!singleMapOk)
        return false;
    return doubleMapOk;
}

// Checks whether the non-pow2 length input is supported for a Bluestein compute scheme
bool NodeFactory::NonPow2LengthSupported(rocfft_precision precision, size_t length)
{
// Exceptions which have been found to perform poorly when compared to the next pow2 length
std::map<rocfft_precision, std::set<size_t>> length_exceptions
static const std::map<rocfft_precision, std::set<size_t>> length_exceptions
= {{rocfft_precision_single,
{224, 2160, 2430, 2880, 3456, 21504, 21952, 23232, 79860, 95832, 110592}},
{rocfft_precision_double,
Expand All @@ -306,8 +316,7 @@ bool NodeFactory::NonPow2LengthSupported(rocfft_precision precision, size_t leng
if(function_pool::has_function(fpkey(length, precision)))
return true;

assert(Large1DLengthsValid(map1DLengthSingle, precision));
assert(Large1DLengthsValid(map1DLengthDouble, precision));
assert(CheckLarge1DMaps());

// and for supported block CC + RC Stockham decompositions
if(precision == rocfft_precision_single
Expand Down
5 changes: 5 additions & 0 deletions library/src/plan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include "plan.h"
#include "../../shared/array_predicate.h"
#include "../../shared/environment.h"
#include "../../shared/ptrdiff.h"
#include "arithmetic.h"
#include "assignment_policy.h"
Expand Down Expand Up @@ -516,6 +517,10 @@ rocfft_status rocfft_plan_create_internal(rocfft_plan plan,
throw;
}

// plan is compiled, no need to alloc twiddles + kargs etc
if(rocfft_getenv("ROCFFT_INTERNAL_COMPILE_ONLY") == "1")
return rocfft_status_success;

if(!PlanPowX(execPlan)) // PlanPowX enqueues the GPU kernels by function
{

Expand Down
52 changes: 14 additions & 38 deletions library/src/rocfft_aot_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@

using namespace std::placeholders;

#include "../../shared/concurrency.h"
#include "../../shared/environment.h"
#include "../../shared/work_queue.h"
#include "function_pool.h"
#include "rtc_cache.h"
#include "rtc_stockham_gen.h"
Expand All @@ -41,37 +43,12 @@ namespace std
#endif
namespace fs = std::filesystem;

#include <condition_variable>
#include <mutex>
#include <queue>
struct CompileQueue
struct WorkItem
{
struct WorkItem
{
std::string kernel_name;
kernel_src_gen_t generate_src;
};
void push(WorkItem&& i)
{
std::unique_lock<std::mutex> lock(queueMutex);
items.emplace(std::move(i));
emptyWait.notify_all();
}
WorkItem pop()
{
std::unique_lock<std::mutex> lock(queueMutex);
while(items.empty())
emptyWait.wait(lock);
WorkItem item(items.front());
items.pop();
return item;
}

private:
std::queue<WorkItem> items;
std::mutex queueMutex;
std::condition_variable emptyWait;
std::string kernel_name;
kernel_src_gen_t generate_src;
};
typedef WorkQueue<WorkItem> CompileQueue;

// call supplied function with exploded out combinations of
// direction, placement, array types, unitstride-ness, callbacks
Expand All @@ -83,11 +60,9 @@ void stockham_combo(
{
for(auto placement : {rocfft_placement_inplace, rocfft_placement_notinplace})
{
for(auto inArrayType :
{rocfft_array_type_complex_interleaved, rocfft_array_type_complex_planar})
for(auto inArrayType : {rocfft_array_type_complex_interleaved})
{
for(auto outArrayType :
{rocfft_array_type_complex_interleaved, rocfft_array_type_complex_planar})
for(auto outArrayType : {rocfft_array_type_complex_interleaved})
{
// inplace requires same array types
if(placement == rocfft_placement_inplace && inArrayType != outArrayType)
Expand Down Expand Up @@ -151,11 +126,12 @@ void build_stockham_function_pool(CompileQueue& queue)
intrinsic_modes.push_back(ENABLE_LOAD_ONLY);
}

// SBCC can be used with or without large twd. Large
// twd may be base 4, 5, 6, 8. Base 8 can
// be 2 or 3 steps; other bases are always 3 step.
// SBCC can be used with or without large twd. Large twd may be
// base 4, 5, 6, 8. Base 4 is unused since it's only useful up
// to 4k lengths, which we already have single kernels for. Base
// 8 can be 2 or 3 steps; other bases are always 3 step.
static const std::array<size_t, 2> base_steps[]
= {{0, 0}, {4, 3}, {5, 3}, {6, 3}, {8, 2}, {8, 3}};
= {{0, 0}, {5, 3}, {6, 3}, {8, 2}, {8, 3}};
for(auto base_step : base_steps)
{
for(auto intrinsic : intrinsic_modes)
Expand Down Expand Up @@ -254,7 +230,7 @@ int main(int argc, char** argv)

CompileQueue queue;

static const size_t NUM_THREADS = std::thread::hardware_concurrency();
static const size_t NUM_THREADS = rocfft_concurrency();
std::vector<std::thread> threads;
threads.reserve(NUM_THREADS);
for(size_t i = 0; i < NUM_THREADS; ++i)
Expand Down
3 changes: 3 additions & 0 deletions library/src/rocfft_rtc_helper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ int main(int argc, const char* const* argv)
// compile and write code object to stdout
auto code = compile_inprocess(kernel_src, gpu_arch);
std::cout.write(code.data(), code.size());
std::cout.flush();
if(!std::cout.good())
return 1;
return 0;
}
catch(std::exception& e)
Expand Down
4 changes: 4 additions & 0 deletions library/src/rtc_kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include "rtc_kernel.h"
#include "../../shared/array_predicate.h"
#include "../../shared/environment.h"
#include "device/generator/stockham_gen.h"

#include "device/kernel-generator-embed.h"
Expand All @@ -32,6 +33,9 @@

RTCKernel::RTCKernel(const std::string& kernel_name, const std::vector<char>& code)
{
// if we're only compiling, no need to actually load the code objects
if(rocfft_getenv("ROCFFT_INTERNAL_COMPILE_ONLY") == "1")
return;
if(hipModuleLoadData(&module, code.data()) != hipSuccess)
throw std::runtime_error("failed to load module");

Expand Down
Loading

0 comments on commit 979a451

Please sign in to comment.