-
Hi there! I have a board with a RISC-V CPU which supports only RVV 0.7.1 (Allwinner D1, aka XuanTie C906). Current LLVM has targeted RVV 1.0 for a long time, so I'm trying to find a working configuration of Halide + LLVM which will support vectorization on a real RISC-V CPU. Let me collect my observations here; perhaps someone can help me. I will also update this page with the current state. General setup
Example of reason: Something working — simple read-write compiled to a static library with: const int width = 1920;
const int height = 1080;
Buffer<uint8_t> input(width, height, 3);
Func f("func");
Var x("x"), y("y");
f(x, y) = input(x, y, 0);
f.bound(x, 0, width).bound(y, 0, height);
f.vectorize(x, 16); LLVM: XUANTIE-RV/llvm-project@814659c + patch to disable --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1503,10 +1503,11 @@ bool RISCVDAGToDAGISel::selectZExti32(SDValue N, SDValue &Val) {
// allows us to choose betwen VSETIVLI or VSETVLI later.
bool RISCVDAGToDAGISel::selectVLOp(SDValue N, SDValue &VL) {
auto *C = dyn_cast<ConstantSDNode>(N);
- if (C && isUInt<5>(C->getZExtValue()))
- VL = CurDAG->getTargetConstant(C->getZExtValue(), SDLoc(N),
- N->getValueType(0));
- else
+ // if (C && isUInt<5>(C->getZExtValue())) {
+ // VL = CurDAG->getTargetConstant(C->getZExtValue(), SDLoc(N),
+ // N->getValueType(0));
+ // }
+ // else
VL = N;
return true; Halide: 9e5c5ce with the following patch: diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fd714980d..ae46a40c0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -527,7 +527,7 @@ endif ()
if (BUILD_SHARED_LIBS)
message(STATUS "Building autoschedulers enabled")
- add_subdirectory(autoschedulers)
+ # add_subdirectory(autoschedulers)
else ()
message(STATUS "Building autoschedulers disabled (static Halide)")
endif ()
diff --git a/src/CodeGen_RISCV.cpp b/src/CodeGen_RISCV.cpp
index 434105724..82005ce25 100644
--- a/src/CodeGen_RISCV.cpp
+++ b/src/CodeGen_RISCV.cpp
@@ -52,6 +52,7 @@ string CodeGen_RISCV::mattrs() const {
if (target.has_feature(Target::RVV)) {
arch_flags += ",+experimental-v";
}
+ arch_flags += ",+xtheadc";
return arch_flags;
} Problem: Other results
|
Beta Was this translation helpful? Give feedback.
Replies: 3 comments 6 replies
-
Tagging @zvookin |
Beta Was this translation helpful? Give feedback.
-
I have made some progress. Below is a quite limited but working example of RGB to grayscale image conversion. It runs in 72 ms without RVV and 14 ms with RVV on the C906 CPU:
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1499,11 +1499,11 @@ bool RISCVDAGToDAGISel::selectZExti32(SDValue N, SDValue &Val) {
// Select VL as a 5 bit immediate or a value that will become a register. This
// allows us to choose betwen VSETIVLI or VSETVLI later.
bool RISCVDAGToDAGISel::selectVLOp(SDValue N, SDValue &VL) {
- auto *C = dyn_cast<ConstantSDNode>(N);
- if (C && isUInt<5>(C->getZExtValue()))
- VL = CurDAG->getTargetConstant(C->getZExtValue(), SDLoc(N),
- N->getValueType(0));
- else
+ // auto *C = dyn_cast<ConstantSDNode>(N);
+ // if (C && isUInt<5>(C->getZExtValue()))
+ // VL = CurDAG->getTargetConstant(C->getZExtValue(), SDLoc(N),
+ // N->getValueType(0));
+ // else
VL = N;
return true;
--- a/dependencies/llvm/CMakeLists.txt
+++ b/dependencies/llvm/CMakeLists.txt
@@ -105,7 +105,7 @@ add_library(Halide_LLVM INTERFACE)
add_library(Halide::LLVM ALIAS Halide_LLVM)
set_target_properties(Halide_LLVM PROPERTIES EXPORT_NAME LLVM)
-target_include_directories(Halide_LLVM INTERFACE $<BUILD_INTERFACE:${LLVM_INCLUDE_DIRS}>)
+target_include_directories(Halide_LLVM INTERFACE "$<BUILD_INTERFACE:${LLVM_INCLUDE_DIRS}>")
target_compile_definitions(Halide_LLVM INTERFACE ${Halide_LLVM_DEFS})
# Link LLVM libraries to Halide_LLVM, depending on shared, static, or bundled selection.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 8b3be6a07..fc636a636 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -509,7 +509,7 @@ endif ()
if (BUILD_SHARED_LIBS)
message(STATUS "Building autoschedulers enabled")
- add_subdirectory(autoschedulers)
+ # add_subdirectory(autoschedulers)
else ()
message(STATUS "Building autoschedulers disabled (static Halide)")
endif ()
diff --git a/src/JITModule.cpp b/src/JITModule.cpp
index 0e3837ec6..245f885ff 100644
--- a/src/JITModule.cpp
+++ b/src/JITModule.cpp
@@ -295,9 +295,9 @@ void JITModule::compile_module(std::unique_ptr<llvm::Module> m, const string &fu
// Do any target-specific initialization
std::vector<llvm::JITEventListener *> listeners;
- if (target.arch == Target::X86) {
- listeners.push_back(llvm::JITEventListener::createIntelJITEventListener());
- }
+ // if (target.arch == Target::X86) {
+ // listeners.push_back(llvm::JITEventListener::createIntelJITEventListener());
+ // }
// TODO: If this ever works in LLVM, this would allow profiling of JIT code with symbols with oprofile.
//listeners.push_back(llvm::createOProfileJITEventListener());
diff --git a/src/LLVM_Headers.h b/src/LLVM_Headers.h
index 555c49368..b50fb89d2 100644
--- a/src/LLVM_Headers.h
+++ b/src/LLVM_Headers.h
@@ -25,7 +25,7 @@
#include <lld/Common/Driver.h>
#endif
-#include <llvm/ExecutionEngine/JITEventListener.h>
+// #include <llvm/ExecutionEngine/JITEventListener.h>
#include <llvm/ExecutionEngine/MCJIT.h>
#include <llvm/ExecutionEngine/SectionMemoryManager.h>
export HL_LLVM_ARGS="-riscv-v-vector-bits-min 128"
g++ main.cpp -I$(realpath $HOME/halide-build/include) -L$(realpath $HOME/halide-build/src) -lHalide -std=c++17 -o app
./app #include <Halide.h>
using namespace Halide;
const int width = 1920;
const int height = 1080;
int main(int argc, char** argv) {
static Func f("bgr2gray");
int32_t R2GRAY = 77.0f, G2GRAY = 150, B2GRAY = 29;
Buffer<uint16_t> input(width, height, 3);
Var x("x"), y("y"), c("c");
Expr r = input(x, y, 0);
Expr g = input(x, y, 1);
Expr b = input(x, y, 2);
Expr res = (R2GRAY * r + G2GRAY * g + B2GRAY * b) >> 8;
f(x, y) = res;
f.bound(x, 0, width).bound(y, 0, height);
f.vectorize(x, 8);
// Compile
Target target;
target.os = Target::OS::Linux;
target.arch = Target::Arch::RISCV;
target.bits = 64;
std::vector<Target::Feature> features;
features.push_back(Target::RVV);
features.push_back(Target::NoAsserts);
features.push_back(Target::NoRuntime);
target.set_features(features);
std::cout << target << std::endl;
f.print_loop_nest();
try {
f.compile_to_header("bgr2gray.h", {input}, "bgr2gray", target);
f.compile_to_assembly("bgr2gray.s", {input}, "bgr2gray", target);
} catch(Halide::InternalError& ex) {
std::cout << ex.what() << std::endl;
} catch(Halide::CompileError& ex) {
std::cout << ex.what() << std::endl;
}
return 0;
}
# Textual substitutions applied to the generated assembly, in this exact order,
# before it is assembled with -march=rv64gcv0p7.
# NOTE(review): 'vse64' is rewritten to 'vsw', the same mnemonic used for
# 'vse32'; this is kept as-is -- confirm against the RVV 0.7.1 spec.
SUBSTITUTIONS = [
    ('v0p10', 'v0p7'),
    # https://github.com/llvm/llvm-project/commit/47a4a27f47203055a4700b65533262409f83c491
    ('vle8', 'vlbu'),
    ('vle16', 'vlhu'),
    ('vle32', 'vlwu'),
    ('vse8', 'vsb'),
    ('vse16', 'vsh'),
    ('vse32', 'vsw'),
    ('vse64', 'vsw'),
    (', ta, mu', ''),
]

# Read the assembly produced by the Halide generator.
with open('bgr2gray.s', 'rt') as src:
    asm = src.read()

for old, new in SUBSTITUTIONS:
    asm = asm.replace(old, new)

# Write the rewritten assembly next to the original.
with open('bgr2gray_071.s', 'wt') as dst:
    dst.write(asm)
riscv64-unknown-linux-gnu-g++ -march=rv64gcv0p7 -mabi=lp64d -c bgr2gray_071.s -o bgr2gray.o
riscv64-unknown-linux-gnu-g++ -march=rv64gcv0p7 -mabi=lp64d runner.cpp bgr2gray.o -I $HOME/halide-build/include/ -std=c++17 -o runner // runner.cpp
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <vector>
#include "bgr2gray.h"
#include "HalideBuffer.h"
static const int width = 1920;
static const int height = 1080;
/// Times the generated bgr2gray pipeline on a synthetic planar image and
/// compares its output with a floating-point reference.
///
/// Prints the median run time in milliseconds over 1000 runs, then
/// "SUCCESS" if every pixel is within 1 of the reference, or one
/// "FAILED CHECK" line per mismatching pixel.
///
/// @return 0 when the check passes, 1 when at least one pixel mismatches.
int main(int argc, char** argv) {
    std::vector<uint16_t> inpData(height * width * 3);
    std::vector<uint16_t> outData(height * width, 1);
    // Index with size_t so the comparison against size() is not mixed-sign.
    for (std::size_t i = 0; i < inpData.size(); ++i)
        inpData[i] = i % 256;

    Halide::Runtime::Buffer<uint16_t> input(inpData.data(), {width, height, 3});
    Halide::Runtime::Buffer<uint16_t> output(outData.data(), {width, height});

    const int runs = 1000;
    // Store the tick count in its native type to avoid narrowing to int.
    std::vector<std::chrono::milliseconds::rep> times;
    times.reserve(runs);
    for (int i = 0; i < runs; ++i) {
        auto t1 = std::chrono::high_resolution_clock::now();
        bgr2gray(input, output);
        auto t2 = std::chrono::high_resolution_clock::now();
        times.push_back(std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1).count());
    }

    // Report the median run time.
    std::sort(times.begin(), times.end());
    std::cout << times[times.size() / 2] << std::endl;

    std::cout << "check results" << std::endl;
    bool isOK = true;
    for (int i = 0; i < width * height; ++i) {
        // The three channels are read from consecutive width*height planes.
        float r = static_cast<float>(inpData[i]);
        float g = static_cast<float>(inpData[width * height + i]);
        float b = static_cast<float>(inpData[2 * width * height + i]);
        uint16_t gray = static_cast<uint16_t>(r * 0.299f + g * 0.587f + b * 0.114f);
        // Both operands promote to int, so the difference can be negative.
        if (std::abs(gray - outData[i]) > 1) {
            // '\n' rather than std::endl: avoid a flush per failing pixel.
            std::cout << "FAILED CHECK: " << gray << " != " << outData[i] << '\n';
            isOK = false;
        }
    }
    if (isOK) {
        std::cout << "SUCCESS" << std::endl;
    }
    // Let scripts detect a failed check through the exit status.
    return isOK ? 0 : 1;
}
Beta Was this translation helpful? Give feedback.
-
I was finally able to run a set of algorithms (rgb2gray, blur, 2D convolution) with the RVV feature on the C906 CPU, with some limitations. Published at https://github.com/YADRO-KNS/halide_riscv. |
Beta Was this translation helpful? Give feedback.
I was finally able to run a set of algorithms (rgb2gray, blur, 2D convolution) with the RVV feature on the C906 CPU, with some limitations. Published at https://github.com/YADRO-KNS/halide_riscv.