diff --git a/ci/regression.sh.in b/ci/regression.sh.in index cb9f07616..4841b2b3b 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -363,8 +363,8 @@ scope() { echo "begin scope tests..." - SCOPE_DEPTH=256 ./ci/blackbox.sh --driver=opae --app=demo --args="-n1" --scope - SCOPE_DEPTH=256 ./ci/blackbox.sh --driver=xrt --app=demo --args="-n1" --scope + SCOPE_DEPTH=128 ./ci/blackbox.sh --driver=opae --app=demo --args="-n1" --scope + SCOPE_DEPTH=128 ./ci/blackbox.sh --driver=xrt --app=demo --args="-n1" --scope echo "debugging scope done!" } @@ -385,7 +385,7 @@ synthesis() echo "begin synthesis tests..." PREFIX=build_base make -C hw/syn/yosys clean - PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys synthesis + PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE -DNUM_WARPS=2 -DNUM_THREADS=2" make -C hw/syn/yosys synthesis echo "synthesis tests done!" } diff --git a/perf/cache/cache_perf.log b/perf/cache/cache_perf.log deleted file mode 100644 index 0a4a55cc8..000000000 --- a/perf/cache/cache_perf.log +++ /dev/null @@ -1,3 +0,0 @@ -CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -running: CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 make -C ./ci/../driver/rtlsim -verilator --build --exe --cc Vortex --top-module Vortex --language 1800-2009 --assert -Wall -Wpedantic -Wno-DECLFILENAME -Wno-REDEFMACRO --x-initial unique --x-assign unique verilator.vlt -I../../hw/rtl -I../../hw/dpi -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache -I../../hw/rtl/simulate -I../../hw/rtl/fp_cores -I../../third_party/fpnew/src/common_cells/include -I../../third_party/fpnew/src/common_cells/src -I../../third_party/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../third_party/fpnew/src -I../../hw/rtl/tex_unit -I../../hw/rtl/raster_unit -I../../hw/rtl/rop_unit -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -j 64 -DNDEBUG -DIMUL_DPI -DIDIV_DPI -DFPU_DPI ../common/util.cpp ../common/mem.cpp ../common/softfloat_ext.cpp ../common/rvfloats.cpp ../../hw/dpi/util_dpi.cpp ../../hw/dpi/float_dpi.cpp processor.cpp -CFLAGS '-std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds -fPIC -Wno-maybe-uninitialized -I../../../hw -I../../common -I../../../third_party/softfloat/source/include -I../../../third_party -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -O2 -DNDEBUG' -LDFLAGS '-shared ../../../third_party/softfloat/build/Linux-x86_64-GCC/softfloat.a -L../../../third_party/ramulator -lramulator' -o ../../../driver/rtlsim/librtlsim.so diff --git a/perf/cache/run.sh b/perf/cache/run.sh index ffb86e342..04285c389 100755 --- a/perf/cache/run.sh +++ b/perf/cache/run.sh @@ -10,17 +10,17 @@ sgemm() { echo "begin cache tests" -CONFIGS="-DICACHE_NUM_WAYS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' > ./perf/cache/cache_perf.log -echo -e "\n**************************************\n" >> ./perf/cache/cache_perf.log -CONFIGS="-DDCACHE_NUM_WAYS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log -echo -e "\n**************************************\n" >> ./perf/cache/cache_perf.log -CONFIGS="-DICACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log -echo -e "\n**************************************\n" >> ./perf/cache/cache_perf.log -CONFIGS="-DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log -echo -e "\n**************************************\n" >> ./perf/cache/cache_perf.log -CONFIGS="-DICACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log -echo -e "\n**************************************\n" >> ./perf/cache/cache_perf.log -CONFIGS="-DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log +CONFIGS="-DICACHE_NUM_WAYS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' > cache_perf.log +echo -e "\n**************************************\n" >> cache_perf.log +CONFIGS="-DDCACHE_NUM_WAYS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> cache_perf.log +echo -e "\n**************************************\n" >> cache_perf.log +CONFIGS="-DICACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> cache_perf.log +echo -e "\n**************************************\n" >> cache_perf.log +CONFIGS="-DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> cache_perf.log +echo -e "\n**************************************\n" >> cache_perf.log +CONFIGS="-DICACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> cache_perf.log +echo -e "\n**************************************\n" >> cache_perf.log +CONFIGS="-DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> cache_perf.log echo "cache tests done!" } @@ -36,6 +36,6 @@ case $1 in -h | --help ) usage ;; * ) sgemm - ;; + ;; esac shift \ No newline at end of file diff --git a/sim/common/softfloat_ext.cpp b/sim/common/softfloat_ext.cpp index b1cb8dc65..a9d493b00 100644 --- a/sim/common/softfloat_ext.cpp +++ b/sim/common/softfloat_ext.cpp @@ -148,10 +148,9 @@ static inline uint64_t rsqrte7(uint64_t val, int e, int s, bool sub) { 59, 58, 57, 56, 56, 55, 54, 53}; if (sub) { - while (extract64(sig, s - 1, 1) == 0) { - exp--; - sig <<= 1; - } + while (extract64(sig, s - 1, 1) == 0) + exp--, sig <<= 1; + sig = (sig << 1) & make_mask64(0, s); } @@ -358,9 +357,9 @@ float16_t f16_recip7(float16_t in) { [[fallthrough]]; default: // +- normal uA.ui = recip7(uA.ui, 5, 10, softfloat_roundingMode, sub, &round_abnormal); - if (round_abnormal) - softfloat_exceptionFlags |= - softfloat_flag_inexact | softfloat_flag_overflow; + if (round_abnormal) { + softfloat_exceptionFlags |= softfloat_flag_inexact | softfloat_flag_overflow; + } break; } @@ -401,9 +400,9 @@ float32_t f32_recip7(float32_t in) { [[fallthrough]]; default: // +- normal uA.ui = recip7(uA.ui, 8, 23, softfloat_roundingMode, sub, &round_abnormal); - if (round_abnormal) - softfloat_exceptionFlags |= - softfloat_flag_inexact | softfloat_flag_overflow; + if (round_abnormal) { + softfloat_exceptionFlags |= softfloat_flag_inexact | softfloat_flag_overflow; + } break; } @@ -444,9 +443,9 @@ float64_t f64_recip7(float64_t in) { [[fallthrough]]; default: // +- normal uA.ui = recip7(uA.ui, 11, 52, softfloat_roundingMode, sub, &round_abnormal); - if (round_abnormal) - softfloat_exceptionFlags |= - softfloat_flag_inexact | softfloat_flag_overflow; + if (round_abnormal) { + softfloat_exceptionFlags |= softfloat_flag_inexact | softfloat_flag_overflow; + } break; } diff --git a/sim/simx/execute_v.cpp b/sim/simx/execute_v.cpp index 13c78d79c..15ce0f947 100644 --- a/sim/simx/execute_v.cpp +++ b/sim/simx/execute_v.cpp @@ -44,7 +44,7 @@ template class Madc { public: static R apply(T first, T second, R third) { - return (R)first + (R)second + third > (R)std::numeric_limits::max(); + return ((R)first + (R)second + third) > (R)std::numeric_limits::max(); } static std::string name() { return "Madc"; } }; @@ -62,7 +62,7 @@ template class Msbc { public: static R apply(T first, T second, R third) { - return (R)second < (R)first + third; + return (R)second < ((R)first + third); } static std::string name() { return "Msbc"; } }; @@ -1128,6 +1128,8 @@ class Smul { static std::string name() { return "Smul"; } }; +/////////////////////////////////////////////////////////////////////////////// + bool isMasked(std::vector> &vreg_file, uint32_t maskVreg, uint32_t byteI, bool vmask) { auto &mask = vreg_file.at(maskVreg); uint8_t emask = *(uint8_t *)(mask.data() + byteI / 8); @@ -1155,7 +1157,7 @@ DT &getVregData(std::vector> &vreg_file, uint32_t base } template -void vector_op_vix_load(std::vector> &vreg_file, vortex::Emulator *emul_, Word base_addr, uint32_t rdest, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { +void vector_op_vix_load(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { uint32_t vsew = sizeof(DT) * 8; uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11); if (nfields * emul > 8) { @@ -1177,7 +1179,7 @@ void vector_op_vix_load(std::vector> &vreg_file, vortex::Emula } } -void vector_op_vix_load(std::vector> &vreg_file, vortex::Emulator *emul_, Word base_addr, uint32_t rdest, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { +void vector_op_vix_load(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { switch (vsew) { case 8: vector_op_vix_load(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask); @@ -1198,7 +1200,7 @@ void vector_op_vix_load(std::vector> &vreg_file, vortex::Emula } template -void vector_op_vv_load(std::vector> &vreg_file, vortex::Emulator *emul_, Word base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { +void vector_op_vv_load(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { uint32_t vsew = sizeof(DT) * 8; uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11); if (nfields * emul > 8) { @@ -1238,7 +1240,7 @@ void vector_op_vv_load(std::vector> &vreg_file, vortex::Emulat } } -void vector_op_vv_load(std::vector> &vreg_file, vortex::Emulator *emul_, Word base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { +void vector_op_vv_load(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { switch (vsew) { case 8: vector_op_vv_load(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask); @@ -1259,7 +1261,7 @@ void vector_op_vv_load(std::vector> &vreg_file, vortex::Emulat } template -void vector_op_vix_store(std::vector> &vreg_file, vortex::Emulator *emul_, Word base_addr, uint32_t rsrc3, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { +void vector_op_vix_store(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { uint32_t vsew = sizeof(DT) * 8; uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11); for (uint32_t i = 0; i < vl * nfields; i++) { @@ -1274,7 +1276,7 @@ void vector_op_vix_store(std::vector> &vreg_file, vortex::Emul } } -void vector_op_vix_store(std::vector> &vreg_file, vortex::Emulator *emul_, Word base_addr, uint32_t rsrc3, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { +void vector_op_vix_store(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { switch (vsew) { case 8: vector_op_vix_store(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask); @@ -1295,7 +1297,7 @@ void vector_op_vix_store(std::vector> &vreg_file, vortex::Emul } template -void vector_op_vv_store(std::vector> &vreg_file, vortex::Emulator *emul_, Word base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { +void vector_op_vv_store(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { uint32_t vsew = sizeof(DT) * 8; uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11); for (uint32_t i = 0; i < vl * nfields; i++) { @@ -1328,7 +1330,7 @@ void vector_op_vv_store(std::vector> &vreg_file, vortex::Emula } } -void vector_op_vv_store(std::vector> &vreg_file, vortex::Emulator *emul_, Word base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { +void vector_op_vv_store(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { switch (vsew) { case 8: vector_op_vv_store(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask); @@ -1364,15 +1366,20 @@ void vector_op_vix(DT first, std::vector> &vreg_file, uint32_t template