Add config file support for constants and test control (pytorch#4337)
Summary:
Pull Request resolved: pytorch#4337

Now that the tool is getting larger, a configuration file for defining which tests to run and which to skip, and for specifying values such as thresholds and ranges, comes in handy. This diff adds support for a JSON config file with per-test settings.
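
For illustration, a minimal sketch of a config that skips the warp-size test and leaves everything else at its built-in defaults (key names follow the config.json added below; any test or key omitted from the file falls back to the hard-coded defaults via _enabled() and _get_config()):

  {
    "warp_size": {
      "enabled": false
    }
  }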

Differential Revision: https://internalfb.com/D60060188
estebanpadilla authored and facebook-github-bot committed Jul 25, 2024
1 parent 1efbca1 commit 8ad7cf4
Showing 2 changed files with 155 additions and 24 deletions.
43 changes: 43 additions & 0 deletions backends/vulkan/tools/gpuinfo/config.json
@@ -0,0 +1,43 @@
{
"reg_count": {
"enabled": true,
"threshold": 3,
"compensate": 0.1
},
"buf_cacheline_size": {
"enabled": true,
"threshold": 10,
"compensate": 0.1
},
"buffer_bandwidth": {
"enabled": true,
"range": 134217728,
"nflush": 4,
"nunroll": 16,
"niter": 10
},
"ubo_bandwidth": {
"enabled": true,
"range": 134217728,
"nflush": 4,
"nunroll": 16,
"niter": 10
},
"shared_mem_bandwidth": {
"enabled": true,
"nflush": 4,
"nunroll": 16,
"niter": 10
},
"warp_size": {
"enabled": true,
"threshold": 3,
"compensate": 0.1
},
"tex_bandwidth": {
"enabled": true,
"nflush": 4,
"nunroll": 16,
"niter": 10
}
}
136 changes: 112 additions & 24 deletions backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -8,6 +8,8 @@

#include <executorch/backends/vulkan/runtime/api/api.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
#include <folly/json.h>
#include <fstream>
#include <iostream>

#include "stats.h"
@@ -25,6 +27,7 @@ class App {
uint32_t max_tex_width_;
uint32_t max_tex_height_;
uint32_t max_tex_depth_;
folly::dynamic config_;

public:
App() {
@@ -66,15 +69,19 @@
}

void reg_count() {
if (!_enabled("reg_count")) {
std::cout << "Skipped Register Count" << std::endl;
return;
}

std::cout << std::endl;
std::cout << "------ Register Count ------" << std::endl;
const uint32_t NREG_MIN = 1;
const uint32_t NREG_MAX = 512;
const uint32_t NREG_STEP = 1;

// TODO: Make these values configurable
const double COMPENSATE = 0.01;
const double THRESHOLD = 3;
const double COMPENSATE = _get_config("reg_count", "compensate", 0.01);
const double THRESHOLD = _get_config("reg_count", "threshold", 3);

const uint32_t NGRP_MIN = 1;
const uint32_t NGRP_MAX = 64;
@@ -175,12 +182,17 @@
}

void buf_cacheline_size() {
if (!_enabled("buf_cacheline_size")) {
std::cout << "Skipped Buffer Cacheline Size" << std::endl;
return;
}

std::cout << std::endl;
std::cout << "------ Buffer Cacheline Size ------" << std::endl;

// TODO: Make these values configurable
const double COMPENSATE = 0.01;
const double THRESHOLD = 10;
const double COMPENSATE =
_get_config("buf_cacheline_size", "compensate", 0.01);
const double THRESHOLD = _get_config("buf_cacheline_size", "threshold", 10);

const uint32_t PITCH = buf_cache_size_ / nthread_logic_;
const uint32_t BUF_SIZE = buf_cache_size_;
@@ -237,15 +249,24 @@

private:
void _bandwidth(std::string memtype, uint32_t range) {
// TODO: Make these values configurable
auto memtype_lower = memtype;
std::transform(
memtype_lower.begin(),
memtype_lower.end(),
memtype_lower.begin(),
[](unsigned char c) { return std::tolower(c); });

// Cache lines flushed
const uint32_t NFLUSH = 4;
const uint32_t NFLUSH =
_get_config(memtype_lower + "_bandwidth", "nflush", 4);
// Number of loop unrolls. Changing this value requires an equal change in
// buf_bandwidth.yaml
const uint32_t NUNROLL = 16;
const uint32_t NUNROLL =
_get_config(memtype_lower + "_bandwidth", "nunroll", 16);
// Number of iterations. Increasing this value reduces noise in exchange for
// higher latency.
const uint32_t NITER = 10;
const uint32_t NITER =
_get_config(memtype_lower + "_bandwidth", "niter", 10);
// Vector dimensions (vec4)
const uint32_t VEC_WIDTH = 4;
const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
@@ -273,12 +294,6 @@ class App {
context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_);
vkapi::PipelineBarrier pipeline_barrier{};

auto memtype_lower = memtype;
std::transform(
memtype_lower.begin(),
memtype_lower.end(),
memtype_lower.begin(),
[](unsigned char c) { return std::tolower(c); });
auto shader_name = "buf_bandwidth_" + memtype_lower;

auto time = benchmark_on_gpu(shader_name, 10, [&]() {
@@ -326,29 +341,76 @@
return {1, D * 4, H, W};
}

float _get_config(
const std::string& test,
const std::string& key,
const float default_value) {
if (config_.empty() || config_[test].empty() ||
!config_[test][key].isDouble()) {
std::cout << "Default value for " << test << "." << key << " = "
<< default_value << std::endl;
return default_value;
}

auto value = config_[test][key].getDouble();
std::cout << "Read value for " << test << "." << key << " = " << value
<< std::endl;
return value;
}

bool _enabled(const std::string& test) {
if (config_.empty() || config_[test].empty() ||
!config_[test]["enabled"].isBool()) {
return true;
}
return config_[test]["enabled"].getBool();
}

public:
void buf_bandwidth() {
if (!_enabled("buffer_bandwidth")) {
std::cout << "Skipped Memory Bandwidth" << std::endl;
return;
}

std::cout << "\n------ Memory Bandwidth ------" << std::endl;
// Maximum memory space read - 128MB
// For regular devices, bandwidth plateaus at less memory than this, so more
// is not needed.
const uint32_t RANGE = 128 * 1024 * 1024;
const uint32_t RANGE =
_get_config("buffer_bandwidth", "range", 128 * 1024 * 1024);
_bandwidth("Buffer", RANGE);
}

void ubo_bandwidth() {
if (!_enabled("ubo_bandwidth")) {
std::cout << "Skipped UBO Bandwidth" << std::endl;
return;
}

std::cout << "\n------ UBO Bandwidth ------" << std::endl;
const uint32_t RANGE = 128 * 1024 * 1024;
const uint32_t RANGE =
_get_config("ubo_bandwidth", "range", 128 * 1024 * 1024);
_bandwidth("UBO", RANGE);
}

void shared_mem_bandwidth() {
if (!_enabled("shared_mem_bandwidth")) {
std::cout << "Skipped Shared Memory Bandwidth" << std::endl;
return;
}

std::cout << "\n------ Shared Bandwidth ------" << std::endl;
const uint32_t RANGE = max_shared_mem_size_;
_bandwidth("Shared", RANGE);
}

void tex_bandwidth() {
if (!_enabled("tex_bandwidth")) {
std::cout << "Skipped Texture Bandwidth" << std::endl;
return;
}

for (int dim = 0; dim < 3; dim++) {
std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------"
<< std::endl;
@@ -364,13 +426,13 @@ class App {
const uint32_t RANGE = NVEC * VEC_SIZE;

// Cache lines flushed
const uint32_t NFLUSH = 4;
const uint32_t NFLUSH = _get_config("tex_bandwidth", "nflush", 4);
// Number of loop unrolls. Changing this value requires an equal change in
// tex_bandwidth.yaml
const uint32_t NUNROLL = 16;
const uint32_t NUNROLL = _get_config("tex_bandwidth", "nunroll", 16);
// Number of iterations. Increasing this value reduces noise in exchange
// for higher latency.
const uint32_t NITER = 10;
const uint32_t NITER = _get_config("tex_bandwidth", "niter", 10);
// Number of memory reads per thread
const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
// Number of threads needed to read all texels
@@ -458,6 +520,11 @@ class App {
// In Case 2, like in Adreno, the driver might decide to pack multiple works
// together and dispatch them at once.
void warp_size(bool verbose = false) {
if (!_enabled("warp_size")) {
std::cout << "Skipped Warp Size" << std::endl;
return;
}

std::cout << "\n------ Warp Size ------" << std::endl;

// Method A: Stress test with a kernel that uses complex ALU operations like
@@ -467,8 +534,8 @@
// This timing-based method helps us identify physical warp sizes. It also
// helps with Case 2, when threads of multiple warps are managed by the same
// scheduler at the same time.
const double COMPENSATE = 0.01;
const double THRESHOLD = 3;
const double COMPENSATE = _get_config("warp_size", "compensate", 0.01);
const double THRESHOLD = _get_config("warp_size", "threshold", 3);

uint32_t NITER;

@@ -591,12 +658,33 @@ class App {
std::cout << "PhysicalWarpSize," << warp_size << std::endl;
std::cout << "SMWarpSize," << warp_size_scheduler << std::endl;
}

void load_config(std::string file_path) {
std::ifstream file(file_path);
std::stringstream buffer;
buffer << file.rdbuf();
const std::string json_str = buffer.str();
if (json_str.empty()) {
std::cout << "No config file found." << std::endl;
return;
}
config_ = folly::parseJson(json_str);
}
};

int main(int argc, const char** argv) {
App app;

// TODO: Allow user to skip tests
{
std::string file_path = "config.json";

if (argc > 1) {
file_path = argv[1];
};

app.load_config(file_path);
}

app.reg_count();
app.buf_cacheline_size();
app.buf_bandwidth();

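A usage sketch, assuming the built binary is called vulkan_gpuinfo (the actual target name is not part of this diff): run it from a directory containing config.json, or pass an explicit path as the first argument, as main() above does.

  ./vulkan_gpuinfo                       # loads ./config.json; if the file is missing, built-in defaults are used
  ./vulkan_gpuinfo /path/to/config.json  # loads a config from an explicit path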