Add config file support for constants and test control (pytorch#4337)
Summary:
Pull Request resolved: pytorch#4337

Now that the tool is getting larger, a configuration file for defining which tests to run and which to skip, and for specifying values such as thresholds and ranges, comes in handy. This diff adds support for a JSON config file with per-test settings.
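
For illustration, a minimal sketch of a config that skips the warp-size test and leaves everything else at its built-in defaults (key names follow the config.json added below; any test or key omitted from the file falls back to the hard-coded defaults via _enabled() and _get_config()):

  {
    "warp_size": {
      "enabled": false
    }
  }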

Differential Revision: https://internalfb.com/D60060188
estebanpadilla authored and facebook-github-bot committed Jul 25, 2024
1 parent 1efbca1 commit 8ad7cf4
Showing 2 changed files with 155 additions and 24 deletions.
43 changes: 43 additions & 0 deletions backends/vulkan/tools/gpuinfo/config.json
@@ -0,0 +1,43 @@
{
"reg_count": {
"enabled": true,
"threshold": 3,
"compensate": 0.1
},
"buf_cacheline_size": {
"enabled": true,
"threshold": 10,
"compensate": 0.1
},
"buffer_bandwidth": {
"enabled": true,
"range": 134217728,
"nflush": 4,
"nunroll": 16,
"niter": 10
},
"ubo_bandwidth": {
"enabled": true,
"range": 134217728,
"nflush": 4,
"nunroll": 16,
"niter": 10
},
"shared_mem_bandwidth": {
"enabled": true,
"nflush": 4,
"nunroll": 16,
"niter": 10
},
"warp_size": {
"enabled": true,
"threshold": 3,
"compensate": 0.1
},
"tex_bandwidth": {
"enabled": true,
"nflush": 4,
"nunroll": 16,
"niter": 10
}
}
136 changes: 112 additions & 24 deletions backends/vulkan/tools/gpuinfo/src/app.cpp
@@ -8,6 +8,8 @@

#include <executorch/backends/vulkan/runtime/api/api.h>
#include <executorch/backends/vulkan/runtime/graph/ops/utils/StagingUtils.h>
#include <folly/json.h>
#include <fstream>
#include <iostream>

#include "stats.h"
@@ -25,6 +27,7 @@ class App {
uint32_t max_tex_width_;
uint32_t max_tex_height_;
uint32_t max_tex_depth_;
folly::dynamic config_;

public:
App() {
@@ -66,15 +69,19 @@
}

void reg_count() {
if (!_enabled("reg_count")) {
std::cout << "Skipped Register Count" << std::endl;
return;
}

std::cout << std::endl;
std::cout << "------ Register Count ------" << std::endl;
const uint32_t NREG_MIN = 1;
const uint32_t NREG_MAX = 512;
const uint32_t NREG_STEP = 1;

// TODO: Make these values configurable
const double COMPENSATE = 0.01;
const double THRESHOLD = 3;
const double COMPENSATE = _get_config("reg_count", "compensate", 0.01);
const double THRESHOLD = _get_config("reg_count", "threshold", 3);

const uint32_t NGRP_MIN = 1;
const uint32_t NGRP_MAX = 64;
@@ -175,12 +182,17 @@
}

void buf_cacheline_size() {
if (!_enabled("buf_cacheline_size")) {
std::cout << "Skipped Buffer Cacheline Size" << std::endl;
return;
}

std::cout << std::endl;
std::cout << "------ Buffer Cacheline Size ------" << std::endl;

// TODO: Make these values configurable
const double COMPENSATE = 0.01;
const double THRESHOLD = 10;
const double COMPENSATE =
_get_config("buf_cacheline_size", "compensate", 0.01);
const double THRESHOLD = _get_config("buf_cacheline_size", "threshold", 10);

const uint32_t PITCH = buf_cache_size_ / nthread_logic_;
const uint32_t BUF_SIZE = buf_cache_size_;
@@ -237,15 +249,24 @@

private:
void _bandwidth(std::string memtype, uint32_t range) {
// TODO: Make these values configurable
auto memtype_lower = memtype;
std::transform(
memtype_lower.begin(),
memtype_lower.end(),
memtype_lower.begin(),
[](unsigned char c) { return std::tolower(c); });

// Cache lines flushed
const uint32_t NFLUSH = 4;
const uint32_t NFLUSH =
_get_config(memtype_lower + "_bandwidth", "nflush", 4);
// Number of loop unrolls. Changing this value requires an equal change in
// buf_bandwidth.yaml
const uint32_t NUNROLL = 16;
const uint32_t NUNROLL =
_get_config(memtype_lower + "_bandwidth", "nunroll", 16);
// Number of iterations. Increasing this value reduces noise in exchange for
// higher latency.
const uint32_t NITER = 10;
const uint32_t NITER =
_get_config(memtype_lower + "_bandwidth", "niter", 10);
// Vector dimensions (vec4)
const uint32_t VEC_WIDTH = 4;
const uint32_t VEC_SIZE = VEC_WIDTH * sizeof(float);
@@ -273,12 +294,6 @@ class App {
context(), vkapi::kFloat, VEC_WIDTH * nthread_logic_);
vkapi::PipelineBarrier pipeline_barrier{};

auto memtype_lower = memtype;
std::transform(
memtype_lower.begin(),
memtype_lower.end(),
memtype_lower.begin(),
[](unsigned char c) { return std::tolower(c); });
auto shader_name = "buf_bandwidth_" + memtype_lower;

auto time = benchmark_on_gpu(shader_name, 10, [&]() {
@@ -326,29 +341,76 @@
return {1, D * 4, H, W};
}

float _get_config(
const std::string& test,
const std::string& key,
const float default_value) {
if (config_.empty() || config_[test].empty() ||
!config_[test][key].isDouble()) {
std::cout << "Default value for " << test << "." << key << " = "
<< default_value << std::endl;
return default_value;
}

auto value = config_[test][key].getDouble();
std::cout << "Read value for " << test << "." << key << " = " << value
<< std::endl;
return value;
}

bool _enabled(const std::string& test) {
if (config_.empty() || config_[test].empty() ||
!config_[test]["enabled"].isBool()) {
return true;
}
return config_[test]["enabled"].getBool();
}

public:
void buf_bandwidth() {
if (!_enabled("buffer_bandwidth")) {
std::cout << "Skipped Memory Bandwidth" << std::endl;
return;
}

std::cout << "\n------ Memory Bandwidth ------" << std::endl;
// Maximum memory space read - 128MB
// For regular devices, bandwidth plateaus at less memory than this, so more
// is not needed.
const uint32_t RANGE = 128 * 1024 * 1024;
const uint32_t RANGE =
_get_config("buffer_bandwidth", "range", 128 * 1024 * 1024);
_bandwidth("Buffer", RANGE);
}

void ubo_bandwidth() {
if (!_enabled("ubo_bandwidth")) {
std::cout << "Skipped UBO Bandwidth" << std::endl;
return;
}

std::cout << "\n------ UBO Bandwidth ------" << std::endl;
const uint32_t RANGE = 128 * 1024 * 1024;
const uint32_t RANGE =
_get_config("ubo_bandwidth", "range", 128 * 1024 * 1024);
_bandwidth("UBO", RANGE);
}

void shared_mem_bandwidth() {
if (!_enabled("shared_mem_bandwidth")) {
std::cout << "Skipped Shared Memory Bandwidth" << std::endl;
return;
}

std::cout << "\n------ Shared Bandwidth ------" << std::endl;
const uint32_t RANGE = max_shared_mem_size_;
_bandwidth("Shared", RANGE);
}

void tex_bandwidth() {
if (!_enabled("tex_bandwidth")) {
std::cout << "Skipped Texture Bandwidth" << std::endl;
return;
}

for (int dim = 0; dim < 3; dim++) {
std::cout << "\n------ Texture Bandwidth (Dim = " << dim << ") ------"
<< std::endl;
@@ -364,13 +426,13 @@ class App {
const uint32_t RANGE = NVEC * VEC_SIZE;

// Cache lines flushed
const uint32_t NFLUSH = 4;
const uint32_t NFLUSH = _get_config("tex_bandwidth", "nflush", 4);
// Number of loop unrolls. Changing this value requires an equal change in
// tex_bandwidth.yaml
const uint32_t NUNROLL = 16;
const uint32_t NUNROLL = _get_config("tex_bandwidth", "nunroll", 16);
// Number of iterations. Increasing this value reduces noise in exchange
// for higher latency.
const uint32_t NITER = 10;
const uint32_t NITER = _get_config("tex_bandwidth", "niter", 10);
// Number of memory reads per thread
const uint32_t NREAD_PER_THREAD = NUNROLL * NITER;
// Number of threads needed to read all texels
@@ -458,6 +520,11 @@ class App {
// In Case 2, like in Adreno, the driver might decide to pack multiple works
// together and dispatch them at once.
void warp_size(bool verbose = false) {
if (!_enabled("warp_size")) {
std::cout << "Skipped Warp Size" << std::endl;
return;
}

std::cout << "\n------ Warp Size ------" << std::endl;

// Method A: Stress test with a kernel that uses complex ALU operations like
@@ -467,8 +534,8 @@
// This timing-based method helps us identify physical warp sizes. It also
// helps with Case 2, when threads of multiple warps are managed by the same
// scheduler at the same time.
const double COMPENSATE = 0.01;
const double THRESHOLD = 3;
const double COMPENSATE = _get_config("warp_size", "compensate", 0.01);
const double THRESHOLD = _get_config("warp_size", "threshold", 3);

uint32_t NITER;

@@ -591,12 +658,33 @@ class App {
std::cout << "PhysicalWarpSize," << warp_size << std::endl;
std::cout << "SMWarpSize," << warp_size_scheduler << std::endl;
}

void load_config(std::string file_path) {
std::ifstream file(file_path);
std::stringstream buffer;
buffer << file.rdbuf();
const std::string json_str = buffer.str();
if (json_str.empty()) {
std::cout << "No config file found." << std::endl;
return;
}
config_ = folly::parseJson(json_str);
}
};

int main(int argc, const char** argv) {
App app;

// TODO: Allow user to skip tests
{
std::string file_path = "config.json";

if (argc > 1) {
file_path = argv[1];
};

app.load_config(file_path);
}

app.reg_count();
app.buf_cacheline_size();
app.buf_bandwidth();

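A usage sketch, assuming the built binary is called vulkan_gpuinfo (the actual target name is not part of this diff): run it from a directory containing config.json, or pass an explicit path as the first argument, as main() above does.

  ./vulkan_gpuinfo                       # loads ./config.json; if the file is missing, built-in defaults are used
  ./vulkan_gpuinfo /path/to/config.json  # loads a config from an explicit path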