diff --git a/ci/regression.sh.in b/ci/regression.sh.in index 849a8769f..d6db10074 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -105,7 +105,7 @@ regression() ./ci/blackbox.sh --driver=simx --app=vecadd --rebuild=3 # test for matmul - CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1" + CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1" echo "regression tests done!" } @@ -322,6 +322,10 @@ config2() CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=1" ./ci/blackbox.sh --driver=opae --app=mstress CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=0" ./ci/blackbox.sh --driver=opae --app=mstress + # test memory ports + CONFIGS="-DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=demo + CONFIGS="-DPLATFORM_MEMORY_BANKS=2" ./ci/blackbox.sh --driver=simx --app=demo --threads=32 + echo "configuration-2 tests done!" } diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 29eb5c9d8..187c735db 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -14,8 +14,6 @@ `ifndef VX_CONFIG_VH `define VX_CONFIG_VH - - `ifndef MIN `define MIN(x, y) (((x) < (y)) ? 
(x) : (y)) `endif @@ -170,8 +168,8 @@ `define L3_LINE_SIZE `MEM_BLOCK_SIZE `endif -`ifndef MEMORY_BANKS -`define MEMORY_BANKS 2 +`ifndef PLATFORM_MEMORY_BANKS +`define PLATFORM_MEMORY_BANKS 1 `endif `ifdef XLEN_64 @@ -193,7 +191,7 @@ `endif `ifdef VM_ENABLE -`ifndef PAGE_TABLE_BASE_ADDR +`ifndef PAGE_TABLE_BASE_ADDR `define PAGE_TABLE_BASE_ADDR 64'h0F0000000 `endif @@ -218,7 +216,7 @@ `endif `ifdef VM_ENABLE -`ifndef PAGE_TABLE_BASE_ADDR +`ifndef PAGE_TABLE_BASE_ADDR `define PAGE_TABLE_BASE_ADDR 32'hF0000000 `endif @@ -303,13 +301,13 @@ `ifndef VM_ADDR_MODE `define VM_ADDR_MODE SV32 //or BARE `endif - `ifndef PT_LEVEL + `ifndef PT_LEVEL `define PT_LEVEL (2) `endif `ifndef PTE_SIZE `define PTE_SIZE (4) `endif - `ifndef NUM_PTE_ENTRY + `ifndef NUM_PTE_ENTRY `define NUM_PTE_ENTRY (1024) `endif `ifndef PT_SIZE_LIMIT @@ -319,13 +317,13 @@ `ifndef VM_ADDR_MODE `define VM_ADDR_MODE SV39 //or BARE `endif - `ifndef PT_LEVEL + `ifndef PT_LEVEL `define PT_LEVEL (3) `endif `ifndef PTE_SIZE `define PTE_SIZE (8) `endif - `ifndef NUM_PTE_ENTRY + `ifndef NUM_PTE_ENTRY `define NUM_PTE_ENTRY (512) `endif `ifndef PT_SIZE_LIMIT @@ -604,7 +602,7 @@ // Number of Banks `ifndef DCACHE_NUM_BANKS -`define DCACHE_NUM_BANKS `MIN(`NUM_LSU_LANES, 4) +`define DCACHE_NUM_BANKS `MIN(DCACHE_NUM_REQS, 16) `endif // Core Response Queue Size @@ -647,6 +645,15 @@ `define DCACHE_REPL_POLICY 1 `endif +// Number of Memory Ports +`ifndef L1_MEM_PORTS +`ifdef L1_DISABLE +`define L1_MEM_PORTS `MIN(DCACHE_NUM_REQS, `PLATFORM_MEMORY_BANKS) +`else +`define L1_MEM_PORTS `MIN(`DCACHE_NUM_BANKS, `PLATFORM_MEMORY_BANKS) +`endif +`endif + // LMEM Configurable Knobs //////////////////////////////////////////////////// `ifndef LMEM_DISABLE @@ -674,7 +681,7 @@ // Number of Banks `ifndef L2_NUM_BANKS -`define L2_NUM_BANKS `MIN(4, `NUM_SOCKETS) +`define L2_NUM_BANKS `MIN(L2_NUM_REQS, 16) `endif // Core Response Queue Size @@ -717,6 +724,15 @@ `define L2_REPL_POLICY 1 `endif +// Number of Memory Ports +`ifndef 
L2_MEM_PORTS +`ifdef L2_ENABLE +`define L2_MEM_PORTS `MIN(`L2_NUM_BANKS, `PLATFORM_MEMORY_BANKS) +`else +`define L2_MEM_PORTS `MIN(L2_NUM_REQS, `PLATFORM_MEMORY_BANKS) +`endif +`endif + // L3cache Configurable Knobs ///////////////////////////////////////////////// // Cache Size @@ -726,7 +742,7 @@ // Number of Banks `ifndef L3_NUM_BANKS -`define L3_NUM_BANKS `MIN(8, `NUM_CLUSTERS) +`define L3_NUM_BANKS `MIN(L3_NUM_REQS, 16) `endif // Core Response Queue Size @@ -769,9 +785,13 @@ `define L3_REPL_POLICY 1 `endif -// Number of Memory Ports from LLC -`ifndef NUM_MEM_PORTS -`define NUM_MEM_PORTS `MIN(`MEMORY_BANKS, `L3_NUM_BANKS) +// Number of Memory Ports +`ifndef L3_MEM_PORTS +`ifdef L3_ENABLE +`define L3_MEM_PORTS `MIN(`L3_NUM_BANKS, `PLATFORM_MEMORY_BANKS) +`else +`define L3_MEM_PORTS `MIN(L3_NUM_REQS, `PLATFORM_MEMORY_BANKS) +`endif `endif // ISA Extensions ///////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index d874b9b2b..08a2f6ca5 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -163,6 +163,7 @@ endgenerate `define USE_BLOCK_BRAM (* ramstyle = "block" *) `define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *) `define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *) +`define RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams on" *) `define DISABLE_BRAM (* ramstyle = "logic" *) `define PRESERVE_NET (* preserve *) `define BLACKBOX_CELL (* black_box *) @@ -173,6 +174,7 @@ endgenerate `define USE_BLOCK_BRAM (* ram_style = "block" *) `define USE_FAST_BRAM (* ram_style = "distributed" *) `define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *) +`define RW_RAM_CHECK (* rw_addr_collision = "yes" *) `define DISABLE_BRAM (* ram_style = "registers" *) `define PRESERVE_NET (* keep = "true" *) `define BLACKBOX_CELL (* black_box *) @@ -183,6 +185,7 @@ endgenerate `define USE_BLOCK_BRAM `define USE_FAST_BRAM `define 
NO_RW_RAM_CHECK +`define RW_RAM_CHECK `define DISABLE_BRAM `define PRESERVE_NET `define BLACKBOX_CELL diff --git a/hw/rtl/core/VX_mem_unit.sv b/hw/rtl/core/VX_mem_unit.sv index 931ad65cd..98491e73d 100644 --- a/hw/rtl/core/VX_mem_unit.sv +++ b/hw/rtl/core/VX_mem_unit.sv @@ -47,7 +47,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #( for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lmem_switches VX_lmem_switch #( - .REQ0_OUT_BUF (3), + .REQ0_OUT_BUF (1), .REQ1_OUT_BUF (0), .RSP_OUT_BUF (1), .ARBITER ("P") @@ -78,7 +78,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #( .TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH), .ARBITER ("P"), .REQ_OUT_BUF (3), - .RSP_OUT_BUF (0) + .RSP_OUT_BUF (2) ) lmem_adapter ( .clk (clk), .reset (reset), diff --git a/hw/rtl/libs/VX_async_ram_patch.sv b/hw/rtl/libs/VX_async_ram_patch.sv index fd29e881d..43e8139e6 100644 --- a/hw/rtl/libs/VX_async_ram_patch.sv +++ b/hw/rtl/libs/VX_async_ram_patch.sv @@ -13,12 +13,6 @@ `include "VX_platform.vh" -`define RAM_WRITE_WREN for (integer i = 0; i < WRENW; ++i) begin \ - if (wren[i]) begin \ - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ - end \ - end - `define RAM_INITIALIZATION \ if (INIT_ENABLE != 0) begin : g_init \ if (INIT_FILE != "") begin : g_file \ @@ -32,14 +26,93 @@ end \ end -`define RAM_BYPASS(__d) \ - reg [DATAW-1:0] bypass_data_r; \ - reg bypass_valid_r; \ +`define SYNC_RAM_WF_BLOCK(__d, __re, __we, __ra, __wa) \ + `RAM_ATTRIBUTES `RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + reg [ADDRW-1:0] raddr_r; \ + always @(posedge clk) begin \ + if (__re || __we) begin \ + if (__we) begin \ + ram[__wa] <= wdata; \ + end \ + raddr_r <= __ra; \ + end \ + end \ + assign __d = ram[raddr_r] + +`define SYNC_RAM_WF_WREN_BLOCK(__d, __re, __we, __ra, __wa) \ + `RAM_ATTRIBUTES `RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + reg [ADDRW-1:0] raddr_r; \ always @(posedge clk) begin \ - bypass_valid_r <= read_s && write && (raddr_s == 
waddr); \ - bypass_data_r <= wdata; \ + if (__re || __we) begin \ + if (__we) begin \ + for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ + end \ + end \ + end \ + raddr_r <= __ra; \ + end \ end \ - assign __d = bypass_valid_r ? bypass_data_r : rdata_r + assign __d = ram[raddr_r] + +`define SYNC_RAM_RF_BLOCK(__d, __re, __we, __ra, __wa) \ + `RAM_ATTRIBUTES reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + reg [DATAW-1:0] rdata_r; \ + always @(posedge clk) begin \ + if (__re || __we) begin \ + if (__we) begin \ + ram[__wa] <= wdata; \ + end \ + rdata_r <= ram[__ra]; \ + end \ + end \ + assign __d = rdata_r + +`define SYNC_RAM_RF_WREN_BLOCK(__d, __re, __we, __ra, __wa) \ + `RAM_ATTRIBUTES reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + reg [DATAW-1:0] rdata_r; \ + always @(posedge clk) begin \ + if (__re || __we) begin \ + if (__we) begin \ + for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ + end \ + end \ + end \ + rdata_r <= ram[__ra]; \ + end \ + end \ + assign __d = rdata_r + +`define ASYNC_RAM_BLOCK(__d, __we, __ra, __wa) \ + `RAM_ATTRIBUTES reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + always @(posedge clk) begin \ + if (__we) begin \ + ram[__wa] <= wdata; \ + end \ + end \ + assign __d = ram[__ra] + +`define ASYNC_RAM_BLOCK_WREN(__d, __we, __ra, __wa) \ + `RAM_ATTRIBUTES reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + always @(posedge clk) begin \ + if (__we) begin \ + for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ + end \ + end \ + end \ + end \ + assign __d = ram[__ra] `TRACING_OFF module VX_async_ram_patch #( @@ -47,6 +120,8 @@ module VX_async_ram_patch #( parameter SIZE = 1, parameter WRENW = 1, parameter DUAL_PORT = 0, + parameter FORCE_BRAM = 0, + parameter 
WRITE_FIRST = 0, parameter INIT_ENABLE = 0, parameter INIT_FILE = "", parameter [DATAW-1:0] INIT_VALUE = 0, @@ -79,77 +154,102 @@ module VX_async_ram_patch #( .out ({raddr_s, read_s, is_raddr_reg}) ); - // synchroneous ram - - wire [DATAW-1:0] rdata_s; + wire [DATAW-1:0] rdata_s, rdata_a; - if (WRENW != 1) begin : g_wren_sync_ram - `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; - reg [DATAW-1:0] rdata_r; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (read_s || write) begin - if (write) begin - `RAM_WRITE_WREN + if (1) begin : g_sync_ram + if (WRENW != 1) begin : g_wren + if (FORCE_BRAM) begin : g_bram + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `USE_BLOCK_BRAM + `SYNC_RAM_WF_WREN_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `USE_BLOCK_BRAM + `SYNC_RAM_RF_WREN_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end + end else begin : g_lutram + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES + `SYNC_RAM_WF_WREN_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES + `SYNC_RAM_RF_WREN_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES end - rdata_r <= ram[raddr_s]; end - end - `RAM_BYPASS(rdata_s); - end else begin : g_no_wren_sync_ram - `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; - reg [DATAW-1:0] rdata_r; - `RAM_INITIALIZATION - `UNUSED_VAR (wren) - always @(posedge clk) begin - if (read_s || write) begin - if (write) begin - ram[waddr] <= wdata; + end else begin : g_no_wren + if (FORCE_BRAM) begin : g_bram + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `USE_BLOCK_BRAM + `SYNC_RAM_WF_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `USE_BLOCK_BRAM + `SYNC_RAM_RF_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef 
RAM_ATTRIBUTES + end + end else begin : g_lutram + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES + `SYNC_RAM_WF_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES + `SYNC_RAM_RF_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES end - rdata_r <= ram[raddr_s]; end end - `RAM_BYPASS(rdata_s); end - // asynchronous ram (fallback) - - wire [DATAW-1:0] rdata_a; - - if (DUAL_PORT != 0) begin : g_dp_async_ram - reg [DATAW-1:0] ram [0:SIZE-1]; - `RAM_INITIALIZATION - if (WRENW != 1) begin : g_wren - always @(posedge clk) begin - if (write) begin - `RAM_WRITE_WREN + if (1) begin : g_async_ram + if (DUAL_PORT != 0) begin : g_dp + if (WRENW != 1) begin : g_wren + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `RW_RAM_CHECK + `ASYNC_RAM_BLOCK_WREN(rdata_a, write, raddr, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `NO_RW_RAM_CHECK + `ASYNC_RAM_BLOCK_WREN(rdata_a, write, raddr, waddr); + `undef RAM_ATTRIBUTES end - end - end else begin : g_no_wren - always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; + end else begin : g_no_wren + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `RW_RAM_CHECK + `ASYNC_RAM_BLOCK(rdata_a, write, raddr, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `NO_RW_RAM_CHECK + `ASYNC_RAM_BLOCK(rdata_a, write, raddr, waddr); + `undef RAM_ATTRIBUTES end end - end - assign rdata_a = ram[raddr]; - end else begin : g_sp_async_ram - reg [DATAW-1:0] ram [0:SIZE-1]; - `RAM_INITIALIZATION - if (WRENW != 1) begin : g_wren - always @(posedge clk) begin - if (write) begin - `RAM_WRITE_WREN + end else begin : g_sp + if (WRENW != 1) begin : g_wren + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `RW_RAM_CHECK + `ASYNC_RAM_BLOCK_WREN(rdata_a, write, waddr, waddr); + `undef RAM_ATTRIBUTES + end else begin : 
g_read_first + `define RAM_ATTRIBUTES `NO_RW_RAM_CHECK + `ASYNC_RAM_BLOCK_WREN(rdata_a, write, waddr, waddr); + `undef RAM_ATTRIBUTES end - end - end else begin : g_no_wren - always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; + end else begin : g_no_wren + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `RW_RAM_CHECK + `ASYNC_RAM_BLOCK(rdata_a, write, waddr, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `NO_RW_RAM_CHECK + `ASYNC_RAM_BLOCK(rdata_a, write, waddr, waddr); + `undef RAM_ATTRIBUTES end end end - assign rdata_a = ram[waddr]; end assign rdata = is_raddr_reg ? rdata_s : rdata_a; diff --git a/hw/rtl/libs/VX_dp_ram.sv b/hw/rtl/libs/VX_dp_ram.sv index 0cff67882..2cb88efe5 100644 --- a/hw/rtl/libs/VX_dp_ram.sv +++ b/hw/rtl/libs/VX_dp_ram.sv @@ -80,7 +80,7 @@ module VX_dp_ram #( if (FORCE_BRAM) begin : g_bram if (RDW_MODE == "W") begin : g_write_first if (WRENW != 1) begin : g_wren - (* rw_addr_collision = "yes" *) `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN `RAM_INITIALIZATION reg [ADDRW-1:0] raddr_r; always @(posedge clk) begin @@ -93,7 +93,7 @@ module VX_dp_ram #( end assign rdata = ram[raddr_r]; end else begin : g_no_wren - (* rw_addr_collision = "yes" *) `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION reg [ADDRW-1:0] raddr_r; always @(posedge clk) begin @@ -166,7 +166,7 @@ module VX_dp_ram #( end else begin : g_auto if (RDW_MODE == "W") begin : g_write_first if (WRENW != 1) begin : g_wren - (* rw_addr_collision = "yes" *) `RAM_ARRAY_WREN + `RW_RAM_CHECK `RAM_ARRAY_WREN `RAM_INITIALIZATION reg [ADDRW-1:0] raddr_r; always @(posedge clk) begin @@ -179,7 +179,7 @@ module VX_dp_ram #( end assign rdata = ram[raddr_r]; end else begin : g_no_wren - (* rw_addr_collision = "yes" *) reg [DATAW-1:0] ram [0:SIZE-1]; + `RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; 
`RAM_INITIALIZATION reg [ADDRW-1:0] raddr_r; always @(posedge clk) begin @@ -220,7 +220,7 @@ module VX_dp_ram #( end assign rdata = rdata_r; end - end else begin + end else begin : g_undefined if (WRENW != 1) begin : g_wren `RAM_ARRAY_WREN `RAM_INITIALIZATION @@ -253,30 +253,32 @@ module VX_dp_ram #( end else begin : g_async `UNUSED_VAR (read) if (FORCE_BRAM) begin : g_bram + `ifdef VIVADO + VX_async_ram_patch #( + .DATAW (DATAW), + .SIZE (SIZE), + .WRENW (WRENW), + .DUAL_PORT (1), + .FORCE_BRAM (FORCE_BRAM), + .WRITE_FIRST(RDW_MODE == "W"), + .INIT_ENABLE(INIT_ENABLE), + .INIT_FILE (INIT_FILE), + .INIT_VALUE (INIT_VALUE) + ) async_ram_patch ( + .clk (clk), + .reset (reset), + .read (read), + .write (write), + .wren (wren), + .waddr (waddr), + .wdata (wdata), + .raddr (raddr), + .rdata (rdata) + ); + `else if (RDW_MODE == "W") begin : g_write_first - `ifdef VIVADO - VX_async_ram_patch #( - .DATAW (DATAW), - .SIZE (SIZE), - .WRENW (WRENW), - .DUAL_PORT (1), - .INIT_ENABLE(INIT_ENABLE), - .INIT_FILE (INIT_FILE), - .INIT_VALUE (INIT_VALUE) - ) async_ram_patch ( - .clk (clk), - .reset (reset), - .read (read), - .write (write), - .wren (wren), - .waddr (waddr), - .wdata (wdata), - .raddr (raddr), - .rdata (rdata) - ); - `else if (WRENW != 1) begin : g_wren - `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -285,7 +287,7 @@ module VX_dp_ram #( end assign rdata = ram[raddr]; end else begin : g_no_wren - `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -294,7 +296,6 @@ module VX_dp_ram #( end assign rdata = ram[raddr]; end - `endif end else begin : g_read_first if (WRENW != 1) begin : g_wren `NO_RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN @@ -316,10 +317,11 @@ module VX_dp_ram #( assign rdata = ram[raddr]; end end + `endif end else begin : g_auto 
if (RDW_MODE == "W") begin : g_write_first if (WRENW != 1) begin : g_wren - `RAM_ARRAY_WREN + `RW_RAM_CHECK `RAM_ARRAY_WREN `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -328,7 +330,7 @@ module VX_dp_ram #( end assign rdata = ram[raddr]; end else begin : g_no_wren - reg [DATAW-1:0] ram [0:SIZE-1]; + `RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin diff --git a/hw/rtl/libs/VX_fifo_queue.sv b/hw/rtl/libs/VX_fifo_queue.sv index 720a1a2c6..c7a4aab6d 100644 --- a/hw/rtl/libs/VX_fifo_queue.sv +++ b/hw/rtl/libs/VX_fifo_queue.sv @@ -90,9 +90,6 @@ module VX_fifo_queue #( end end - wire going_empty = (ALM_EMPTY == 1) ? alm_empty : (size[ADDRW-1:0] == ADDRW'(1)); - wire bypass = push && (empty || (going_empty && pop)); - VX_dp_ram #( .DATAW (DATAW), .SIZE (DEPTH), @@ -101,7 +98,7 @@ module VX_fifo_queue #( ) dp_ram ( .clk (clk), .reset (reset), - .read (~bypass), + .read (1'b1), .write (push), .wren (1'b1), .raddr (rd_ptr_r), @@ -112,6 +109,8 @@ module VX_fifo_queue #( if (OUT_REG != 0) begin : g_out_reg reg [DATAW-1:0] data_out_r; + wire going_empty = (ALM_EMPTY == 1) ? 
alm_empty : (size[ADDRW-1:0] == ADDRW'(1)); + wire bypass = push && (empty || (going_empty && pop)); always @(posedge clk) begin if (bypass) begin data_out_r <= data_in; diff --git a/hw/rtl/libs/VX_rr_arbiter.sv b/hw/rtl/libs/VX_rr_arbiter.sv index 1d3b479bf..c86da584a 100644 --- a/hw/rtl/libs/VX_rr_arbiter.sv +++ b/hw/rtl/libs/VX_rr_arbiter.sv @@ -485,7 +485,7 @@ module VX_rr_arbiter #( .D (NUM_REQS) ) grant_decoder ( .sel_in (grant_index), - .data_in (1'b1), + .data_in (grant_valid), .data_out (grant_onehot) ); diff --git a/hw/rtl/libs/VX_sp_ram.sv b/hw/rtl/libs/VX_sp_ram.sv index 88b922384..3c673e462 100644 --- a/hw/rtl/libs/VX_sp_ram.sv +++ b/hw/rtl/libs/VX_sp_ram.sv @@ -77,20 +77,20 @@ module VX_sp_ram #( localparam FORCE_BRAM = !LUTRAM && (SIZE * DATAW >= `MAX_LUTRAM); if (OUT_REG) begin : g_sync if (FORCE_BRAM) begin : g_bram - if (RDW_MODE == "R") begin : g_read_first + if (RDW_MODE == "W") begin : g_write_first if (WRENW != 1) begin : g_wren - `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN `RAM_INITIALIZATION - reg [DATAW-1:0] rdata_r; + reg [ADDRW-1:0] addr_r; always @(posedge clk) begin if (read || write) begin if (write) begin `RAM_WRITE_WREN end - rdata_r <= ram[addr]; + addr_r <= addr; end end - assign rdata = rdata_r; + assign rdata = ram[addr_r]; end else begin : g_no_wren `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION @@ -99,26 +99,28 @@ module VX_sp_ram #( if (read || write) begin if (write) begin ram[addr] <= wdata; + rdata_r <= wdata; + end else begin + rdata_r <= ram[addr]; end - rdata_r <= ram[addr]; end end assign rdata = rdata_r; end - end else if (RDW_MODE == "W") begin : g_write_first + end else if (RDW_MODE == "R") begin : g_read_first if (WRENW != 1) begin : g_wren `USE_BLOCK_BRAM `RAM_ARRAY_WREN `RAM_INITIALIZATION - reg [ADDRW-1:0] addr_r; + reg [DATAW-1:0] rdata_r; always @(posedge clk) begin if (read || write) begin if (write) begin `RAM_WRITE_WREN end - addr_r <= addr; + rdata_r 
<= ram[addr]; end end - assign rdata = ram[addr_r]; + assign rdata = rdata_r; end else begin : g_no_wren `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION @@ -127,10 +129,8 @@ module VX_sp_ram #( if (read || write) begin if (write) begin ram[addr] <= wdata; - rdata_r <= wdata; - end else begin - rdata_r <= ram[addr]; end + rdata_r <= ram[addr]; end end assign rdata = rdata_r; @@ -165,7 +165,7 @@ module VX_sp_ram #( end assign rdata = rdata_r; end - end else if (RDW_MODE == "U") begin : g_unknown + end else if (RDW_MODE == "U") begin : g_undefined if (WRENW != 1) begin : g_wren `USE_BLOCK_BRAM `RAM_ARRAY_WREN `RAM_INITIALIZATION @@ -195,20 +195,20 @@ module VX_sp_ram #( end end end else begin : g_auto - if (RDW_MODE == "R") begin : g_read_first + if (RDW_MODE == "W") begin : g_write_first if (WRENW != 1) begin : g_wren `RAM_ARRAY_WREN `RAM_INITIALIZATION - reg [DATAW-1:0] rdata_r; + reg [ADDRW-1:0] addr_r; always @(posedge clk) begin if (read || write) begin if (write) begin `RAM_WRITE_WREN end - rdata_r <= ram[addr]; + addr_r <= addr; end end - assign rdata = rdata_r; + assign rdata = ram[addr_r]; end else begin : g_no_wren reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION @@ -217,26 +217,28 @@ module VX_sp_ram #( if (read || write) begin if (write) begin ram[addr] <= wdata; + rdata_r <= wdata; + end else begin + rdata_r <= ram[addr]; end - rdata_r <= ram[addr]; end end assign rdata = rdata_r; end - end else if (RDW_MODE == "W") begin : g_write_first + end else if (RDW_MODE == "R") begin : g_read_first if (WRENW != 1) begin : g_wren `RAM_ARRAY_WREN `RAM_INITIALIZATION - reg [ADDRW-1:0] addr_r; + reg [DATAW-1:0] rdata_r; always @(posedge clk) begin if (read || write) begin if (write) begin `RAM_WRITE_WREN end - addr_r <= addr; + rdata_r <= ram[addr]; end end - assign rdata = ram[addr_r]; + assign rdata = rdata_r; end else begin : g_no_wren reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION @@ -245,10 +247,8 @@ module VX_sp_ram #( if (read || 
write) begin if (write) begin ram[addr] <= wdata; - rdata_r <= wdata; - end else begin - rdata_r <= ram[addr]; end + rdata_r <= ram[addr]; end end assign rdata = rdata_r; @@ -283,7 +283,7 @@ module VX_sp_ram #( end assign rdata = rdata_r; end - end else if (RDW_MODE == "U") begin : g_unknown + end else if (RDW_MODE == "U") begin : g_undefined if (WRENW != 1) begin : g_wren `RAM_ARRAY_WREN `RAM_INITIALIZATION @@ -316,30 +316,32 @@ module VX_sp_ram #( end else begin : g_async `UNUSED_VAR (read) if (FORCE_BRAM) begin : g_bram + `ifdef VIVADO + VX_async_ram_patch #( + .DATAW (DATAW), + .SIZE (SIZE), + .WRENW (WRENW), + .DUAL_PORT (0), + .FORCE_BRAM (FORCE_BRAM), + .WRITE_FIRST(RDW_MODE == "W"), + .INIT_ENABLE(INIT_ENABLE), + .INIT_FILE (INIT_FILE), + .INIT_VALUE (INIT_VALUE) + ) async_ram_patch ( + .clk (clk), + .reset (reset), + .read (read), + .write (write), + .wren (wren), + .waddr (addr), + .wdata (wdata), + .raddr (addr), + .rdata (rdata) + ); + `else if (RDW_MODE == "W") begin : g_write_first - `ifdef VIVADO - VX_async_ram_patch #( - .DATAW (DATAW), - .SIZE (SIZE), - .WRENW (WRENW), - .DUAL_PORT (0), - .INIT_ENABLE(INIT_ENABLE), - .INIT_FILE (INIT_FILE), - .INIT_VALUE (INIT_VALUE) - ) async_ram_patch ( - .clk (clk), - .reset (reset), - .read (read), - .write (write), - .wren (wren), - .waddr (addr), - .wdata (wdata), - .raddr (addr), - .rdata (rdata) - ); - `else if (WRENW != 1) begin : g_wren - `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -348,7 +350,7 @@ module VX_sp_ram #( end assign rdata = ram[addr]; end else begin : g_no_wren - `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -357,7 +359,6 @@ module VX_sp_ram #( end assign rdata = ram[addr]; end - `endif end else begin : g_read_first if (WRENW != 1) begin : g_wren 
`NO_RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN @@ -379,10 +380,11 @@ module VX_sp_ram #( assign rdata = ram[addr]; end end + `endif end else begin : g_auto if (RDW_MODE == "W") begin : g_write_first if (WRENW != 1) begin : g_wren - `RAM_ARRAY_WREN + `RW_RAM_CHECK `RAM_ARRAY_WREN `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -391,7 +393,7 @@ module VX_sp_ram #( end assign rdata = ram[addr]; end else begin : g_no_wren - reg [DATAW-1:0] ram [0:SIZE-1]; + `RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION always @(posedge clk) begin if (write) begin @@ -443,22 +445,22 @@ module VX_sp_ram #( end if (OUT_REG) begin : g_sync - if (RDW_MODE == "R") begin : g_read_first - reg [DATAW-1:0] rdata_r; + if (RDW_MODE == "W") begin : g_write_first + reg [ADDRW-1:0] addr_r; always @(posedge clk) begin if (read || write) begin - rdata_r <= ram[addr]; + addr_r <= addr; end end - assign rdata = rdata_r; - end else if (RDW_MODE == "W") begin : g_write_first - reg [ADDRW-1:0] addr_r; + assign rdata = ram[addr_r]; + end else if (RDW_MODE == "R") begin : g_read_first + reg [DATAW-1:0] rdata_r; always @(posedge clk) begin if (read || write) begin - addr_r <= addr; + rdata_r <= ram[addr]; end end - assign rdata = ram[addr_r]; + assign rdata = rdata_r; end else if (RDW_MODE == "N") begin : g_no_change reg [DATAW-1:0] rdata_r; always @(posedge clk) begin diff --git a/hw/rtl/libs/VX_stream_buffer.sv b/hw/rtl/libs/VX_stream_buffer.sv index 4b77df83d..ea4467cb3 100644 --- a/hw/rtl/libs/VX_stream_buffer.sv +++ b/hw/rtl/libs/VX_stream_buffer.sv @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// A stream elastic buffer operates at full-bandwidth where fire_in and fire_out can happen simultaneously +// A stream elastic buffer operates at full-bandwidth where fire_in and fire_out can happen simultaneously // It has the following benefits: // + full-bandwidth throughput // + ready_in and ready_out are decoupled @@ -45,79 +45,66 @@ module VX_stream_buffer #( assign valid_out = valid_in; assign data_out = data_in; - end else if (OUT_REG != 0) begin : g_out_reg + end else begin : g_buffer - reg [DATAW-1:0] data_out_r; - reg [DATAW-1:0] buffer; - reg valid_out_r; - reg no_buffer; + reg [DATAW-1:0] data_out_r, buffer_r; + reg valid_out_r, valid_in_r; wire fire_in = valid_in && ready_in; wire flow_out = ready_out || ~valid_out; always @(posedge clk) begin if (reset) begin - valid_out_r <= 0; - no_buffer <= 1; - end else begin - if (flow_out) begin - no_buffer <= 1; - end else if (valid_in) begin - no_buffer <= 0; - end - if (flow_out) begin - valid_out_r <= valid_in || ~no_buffer; - end + valid_in_r <= 1'b1; + end else if (valid_in || flow_out) begin + valid_in_r <= flow_out; end end always @(posedge clk) begin - if (fire_in) begin - buffer <= data_in; - end - if (flow_out) begin - data_out_r <= no_buffer ? data_in : buffer; + if (reset) begin + valid_out_r <= 1'b0; + end else if (flow_out) begin + valid_out_r <= valid_in || ~valid_in_r; end end - assign ready_in = no_buffer; - assign valid_out = valid_out_r; - assign data_out = data_out_r; + if (OUT_REG != 0) begin : g_out_reg + + always @(posedge clk) begin + if (fire_in) begin + buffer_r <= data_in; + end + end - end else begin : g_no_out_reg + always @(posedge clk) begin + if (flow_out) begin + data_out_r <= valid_in_r ? 
data_in : buffer_r; + end + end - reg [1:0][DATAW-1:0] shift_reg; - reg [1:0] fifo_state, fifo_state_n; + assign data_out = data_out_r; - wire fire_in = valid_in && ready_in; - wire fire_out = valid_out && ready_out; - - always @(*) begin - case ({fire_in, fire_out}) - 2'b10: fifo_state_n = {fifo_state[0], 1'b1}; // 00 -> 01, 01 -> 10 - 2'b01: fifo_state_n = {1'b0, fifo_state[1]}; // 10 -> 01, 01 -> 00 - default: fifo_state_n = fifo_state; - endcase - end + end else begin : g_no_out_reg - always @(posedge clk) begin - if (reset) begin - fifo_state <= 2'b00; - end else begin - fifo_state <= fifo_state_n; + always @(posedge clk) begin + if (fire_in) begin + data_out_r <= data_in; + end end - end - always @(posedge clk) begin - if (fire_in) begin - shift_reg[1] <= shift_reg[0]; - shift_reg[0] <= data_in; + always @(posedge clk) begin + if (fire_in) begin + buffer_r <= data_out_r; + end end + + assign data_out = valid_in_r ? data_out_r : buffer_r; + end - assign ready_in = ~fifo_state[1]; - assign valid_out = fifo_state[0]; - assign data_out = shift_reg[fifo_state[1]]; + assign valid_out = valid_out_r; + assign ready_in = valid_in_r; end diff --git a/hw/scripts/xilinx_async_bram_patch.tcl b/hw/scripts/xilinx_async_bram_patch.tcl index 5af7ba953..e4a684e3b 100644 --- a/hw/scripts/xilinx_async_bram_patch.tcl +++ b/hw/scripts/xilinx_async_bram_patch.tcl @@ -1,3 +1,16 @@ +# Copyright © 2019-2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ namespace eval vortex { variable debug 0 @@ -17,6 +30,25 @@ proc str_replace {str match repl} { return $result } +proc regex_escape {str} { + return [string map { + \\ \\\\ + ^ \\^ + . \\. + \[ \\\[ + \] \\\] + \$ \\\$ + \( \\\( + \) \\\) + | \\| + * \\* + + \\+ + ? \\? + \{ \\\{ + \} \\\} + } $str] +} + proc unique_cell_name {name} { if {[get_cells -quiet $name] == {}} { return $name } set index 0 @@ -31,29 +63,58 @@ proc unique_net_name {name} { return ${name}_${index} } -proc find_nested_cells {parent name_match {should_exist 1}} { - set matching_cells {} - foreach cell [get_cells -hierarchical -include_replicated_objects -filter "PARENT == $parent"] { - set name [get_property NAME $cell] - if {[regexp $name_match $name]} { - lappend matching_cells $cell +proc build_parent_child_map {all_cells} { + set parent_child_map {} + foreach cell $all_cells { + set parent [get_property PARENT $cell] + if {$parent ne ""} { + if {[dict exists $parent_child_map $parent]} { + dict lappend parent_child_map $parent $cell + } else { + dict set parent_child_map $parent [list $cell] + } } } - if {[llength $matching_cells] == 0} { - print_error "No matching cell found for '$parent' matching '$name_match'." 
$should_exist + return $parent_child_map +} + +proc find_cell_descendants_recursive {parent_cell parent_child_map} { + set descendants {} + if {[dict exists $parent_child_map $parent_cell]} { + set children [dict get $parent_child_map $parent_cell] + foreach child $children { + # Add the child to the list + lappend descendants $child + # Recursively add its descendants + set sub_descendants [find_cell_descendants_recursive $child $parent_child_map] + lappend descendants {*}$sub_descendants + } } - return $matching_cells + return $descendants +} + +proc find_cell_descendants {parent_cell} { + set all_cells [get_cells -hierarchical] + set parent_child_map [build_parent_child_map $all_cells] + return [find_cell_descendants_recursive $parent_cell $parent_child_map] } -proc find_nested_cell {parent name_match} { - foreach cell [get_cells -hierarchical -filter "PARENT == $parent"] { - set name [get_property NAME $cell] - if {$name == $name_match} { - return $cell +proc find_nested_cells {parent_cell name_match {should_exist 1}} { + set hier_sep [get_hierarchy_separator] + set matching_cells {} + foreach cell [find_cell_descendants $parent_cell] { + set parent_name [get_property PARENT $cell] + set cell_name [get_property NAME $cell] + set name_prefix [regex_escape "${parent_name}${hier_sep}"] + set pattern "${name_prefix}${name_match}" + if {[regexp $pattern $cell_name]} { + lappend matching_cells $cell } } - puts "ERROR: No matching cell found for '$parent' matching '$name_match'." - exit -1 + if {[llength $matching_cells] == 0} { + print_error "No matching cell found for '$parent_cell' matching '$name_match'." 
$should_exist + } + return $matching_cells } proc find_cell_nets {cell name_match {should_exist 1}} { @@ -70,22 +131,23 @@ proc find_cell_nets {cell name_match {should_exist 1}} { return $matching_nets } -proc get_cell_net {cell name_match} { - foreach net [get_nets -hierarchical -filter "PARENT_CELL == $cell"] { - set name [get_property NAME $net] - if {$name == $name_match} { - return $net - } +proc get_cell_net {cell name} { + set net [get_nets -hierarchical -filter "PARENT_CELL == $cell && NAME == $name"] + if {[llength $net] == 0} { + puts "ERROR: No matching net found for '$cell' matching '$name'." + exit -1 } - puts "ERROR: No matching net found for '$cell' matching '$name_match'." - exit -1 + return $net; } proc find_cell_pins {cell name_match {should_exist 1}} { + set hier_sep [get_hierarchy_separator] set matching_pins {} foreach pin [get_pins -of_objects $cell] { set name [get_property NAME $pin] - if {[regexp $name_match $name]} { + set name_prefix [regex_escape "${cell}${hier_sep}"] + set pattern "${name_prefix}${name_match}" + if {[regexp $pattern $name]} { lappend matching_pins $pin } } @@ -95,15 +157,31 @@ proc find_cell_pins {cell name_match {should_exist 1}} { return $matching_pins } -proc get_cell_pin {cell name_match} { - foreach pin [get_pins -of_objects $cell] { - set name [get_property NAME $pin] - if {$name == $name_match} { - return $pin - } +proc get_cell_pin {cell name} { + set pin [get_pins -of_objects $cell -filter "NAME == $name"] + if {[llength $pin] == 0} { + puts "ERROR: No matching pin found for '$cell' matching '$name'." + exit -1 } - puts "ERROR: No matching pin found for '$cell' matching '$name_match'." - exit -1 + return $pin +} + +proc remove_cell_from_netlist {cell} { + variable debug + + puts "INFO: Removing cell '$cell' from the netlist." 
+ + # Disconnect all pins of the cell + #foreach pin [get_pins -quiet -of_objects $cell] { + # foreach net [get_nets -quiet -of_objects $pin] { + # disconnect_net -net $net -objects $pin + # if {$debug} {puts "DEBUG: Disconnected net '$net' from pin '$pin'."} + # } + #} + + # Remove the cell + remove_cell $cell + if {$debug} {puts "DEBUG: Cell '$cell' was removed successfully."} } proc replace_pin_source {pin source_pin} { @@ -141,10 +219,42 @@ proc replace_pin_source {pin source_pin} { if {$debug} {puts "DEBUG: Connected net '$source_net' to pin '$pin'."} } -proc create_register_next {reg_cell prefix_name} { +proc find_net_driver {input_net {should_exist 1}} { + set driverPins [get_pins -quiet -leaf -of_objects $input_net -filter {DIRECTION == "OUT"}] + if {[llength $driverPins] == 0} { + set driverPorts [get_ports -quiet -of_objects $input_net -filter {DIRECTION == "IN"}] + if {[llength $driverPorts] == 0} { + print_error "No driver found for '$input_net'." $should_exist + } elseif {[llength $driverPorts] > 1} { + puts "WARNING: Multiple driver ports found for '$input_net'." + return [lindex $driverPorts 0] + } + return $driverPorts + } elseif {[llength $driverPins] > 1} { + puts "WARNING: Multiple driver pins found for '$input_net'." + return [lindex $driverPins 0] + } + return $driverPins +} + +proc find_pin_driver {input_pin {should_exist 1}} { + set net [get_nets -quiet -of_objects $input_pin] + if {[llength $net] == 0} { + print_error "No net connected to pin '$input_pin'." $should_exist + return "" + } elseif {[llength $net] > 1} { + puts "ERROR: Multiple nets connected to pin '$input_pin'." 
+ exit -1 + } + return [find_net_driver $net] +} + +proc create_register_next {parent reg_cell} { variable debug - set reg_d_pin [get_pins -of_objects $reg_cell -filter {NAME =~ "*/D"}] + set hier_sep [get_hierarchy_separator] + + set reg_d_pin [get_pins "${reg_cell}${hier_sep}D"] if {[llength $reg_d_pin] == 0} { puts "ERROR: No D pin found on register cell '$reg_cell'." exit -1 @@ -167,7 +277,7 @@ proc create_register_next {reg_cell prefix_name} { set register_type [get_property REF_NAME $reg_cell] if {$register_type == "FDRE"} { - set reg_r_pin [get_pins -of_objects $reg_cell -filter {NAME =~ "*/R"}] + set reg_r_pin [get_pins "${reg_cell}${hier_sep}R"] if {[llength $reg_r_pin] == 0} { puts "ERROR: No R pin found on FDRE cell '$reg_cell'." exit -1 @@ -184,7 +294,7 @@ proc create_register_next {reg_cell prefix_name} { exit -1 } } elseif {$register_type == "FDSE"} { - set reg_s_pin [get_pins -of_objects $reg_cell -filter {NAME =~ "*/S"}] + set reg_s_pin [get_pins "${reg_cell}${hier_sep}S"] if {[llength $reg_s_pin] == 0} { puts "ERROR: No S pin found on FDSE cell '$reg_cell'." exit -1 @@ -229,7 +339,7 @@ proc create_register_next {reg_cell prefix_name} { # Use a 2x1 LUT to describe the logic: # FDRE: O = I1 ? 0 : I0; where I0=D, I1=R # FDSE: O = I1 ? 1 : I0; where I0=D, I1=S - set lut_name [unique_cell_name $prefix_name] + set lut_name [unique_cell_name "${parent}${hier_sep}raddr_next"] set lut_cell [create_cell -reference LUT2 $lut_name] puts "INFO: Created lut cell: '$lut_cell'" @@ -242,7 +352,7 @@ proc create_register_next {reg_cell prefix_name} { exit 1 } - set lut_i0_pin [get_pins -of_objects $lut_cell -filter {NAME =~ "*/I0"}] + set lut_i0_pin [get_pins "${lut_cell}${hier_sep}I0"] if {[llength $lut_i0_pin] == 0} { puts "ERROR: No I0 pin found on FDSE cell '$lut_cell'." 
exit -1 @@ -251,7 +361,7 @@ proc create_register_next {reg_cell prefix_name} { exit -1 } - set lut_i1_pin [get_pins -of_objects $lut_cell -filter {NAME =~ "*/I1"}] + set lut_i1_pin [get_pins "${lut_cell}${hier_sep}I1"] if {[llength $lut_i1_pin] == 0} { puts "ERROR: No I1 pin found on FDSE cell '$lut_cell'." exit -1 @@ -260,7 +370,7 @@ proc create_register_next {reg_cell prefix_name} { exit -1 } - set lut_o_pin [get_pins -of_objects $lut_cell -filter {NAME =~ "*/O"}] + set lut_o_pin [get_pins "${lut_cell}${hier_sep}O"] if {[llength $lut_o_pin] == 0} { puts "ERROR: No O pin found on FDSE cell '$lut_cell'." exit -1 @@ -278,19 +388,22 @@ proc create_register_next {reg_cell prefix_name} { return $lut_o_pin } -proc getOrCreateVCCPin {prefix_name} { +proc getOrCreateVCCPin {parent} { variable debug - set vcc_cell "" - set vcc_cells [get_cells -quiet -filter {REF_NAME == VCC}] - if {[llength $vcc_cells] == 0} { - set cell_name [unique_cell_name $prefix_name] + set hier_sep [get_hierarchy_separator] + set cell_name "${parent}${hier_sep}VCC" + + set vcc_cell [get_cells -quiet $cell_name] + if {[llength $vcc_cell] == 0} { set vcc_cell [create_cell -reference VCC $cell_name] puts "INFO: Created VCC cell: '$vcc_cell'" - } else { - set vcc_cell [lindex $vcc_cells 0] + } elseif {[llength $vcc_cell] > 1} { + puts "ERROR: Multiple VCC cells found with name '$cell_name'." + exit -1 } - set vcc_pin [get_pins -of_objects $vcc_cell -filter {NAME =~ "*/P"}] + + set vcc_pin [get_pins "${vcc_cell}${hier_sep}P"] if {[llength $vcc_pin] == 0} { puts "ERROR: No VCC pin found on VCC cell '$vcc_cell'." exit -1 @@ -298,22 +411,26 @@ proc getOrCreateVCCPin {prefix_name} { puts "ERROR: Multiple VCC pins found on VCC cell '$vcc_cell'." 
exit -1 } + return $vcc_pin } -proc getOrCreateGNDPin {prefix_name} { +proc getOrCreateGNDPin {parent} { variable debug - set gnd_cell "" - set gnd_cells [get_cells -quiet -filter {REF_NAME == GND}] - if {[llength $gnd_cells] == 0} { - set cell_name [unique_cell_name $prefix_name] + set hier_sep [get_hierarchy_separator] + set cell_name "${parent}${hier_sep}GND" + + set gnd_cell [get_cells -quiet $cell_name] + if {[llength $gnd_cell] == 0} { set gnd_cell [create_cell -reference GND $cell_name] puts "INFO: Created GND cell: '$gnd_cell'" - } else { - set gnd_cell [lindex $gnd_cells 0] + } elseif {[llength $gnd_cell] > 1} { + puts "ERROR: Multiple GND cells found with name '$cell_name'." + exit -1 } - set gnd_pin [get_pins -of_objects $gnd_cell -filter {NAME =~ "*/G"}] + + set gnd_pin [get_pins "${gnd_cell}${hier_sep}G"] if {[llength $gnd_pin] == 0} { puts "ERROR: No GND pin found on GND cell '$gnd_cell'." exit -1 @@ -321,6 +438,7 @@ proc getOrCreateGNDPin {prefix_name} { puts "ERROR: Multiple GND pins found on GND cell '$gnd_cell'." exit -1 } + return $gnd_pin } @@ -338,35 +456,6 @@ proc find_net_sinks {input_net {should_exist 1}} { return $sink_pins } -proc find_net_driver {input_net {should_exist 1}} { - set driverPins [get_pins -quiet -leaf -of_objects $input_net -filter {DIRECTION == "OUT"}] - if {[llength $driverPins] == 0} { - set driverPorts [get_ports -quiet -of_objects $input_net -filter {DIRECTION == "IN"}] - if {[llength $driverPorts] == 0} { - print_error "No driver found for '$input_net'." $should_exist - } elseif {[llength $driverPorts] > 1} { - puts "WARNING: Multiple driver ports found for '$input_net'." - return [lindex $driverPorts 0] - } - return $driverPorts - } elseif {[llength $driverPins] > 1} { - puts "WARNING: Multiple driver pins found for '$input_net'." 
- return [lindex $driverPins 0] - } - return $driverPins -} - -proc find_pin_driver {input_pin {should_exist 1}} { - set net [get_nets -quiet -of_objects $input_pin] - if {[llength $net] == 0} { - print_error "No net connected to pin '$input_pin'." $should_exist - } elseif {[llength $net] > 1} { - puts "ERROR: Multiple nets connected to pin '$input_pin'." - exit -1 - } - return [find_net_driver $net] -} - proc find_matching_nets {cell nets match repl} { set matching_nets {} foreach net $nets { @@ -386,6 +475,25 @@ proc find_matching_nets {cell nets match repl} { return $matching_nets } +proc find_matching_pins {cell pins match repl} { + set matching_pins {} + foreach pin $pins { + set pin_name [str_replace $pin $match $repl] + set matching_pin [get_cell_pin $cell $pin_name] + if {$matching_pin != ""} { + lappend matching_pins $matching_pin + } + } + if {[llength $matching_pins] == 0} { + puts "ERROR: No matching pins found for '$pins'." + exit -1 + } elseif {[llength $matching_pins] != [llength $pins]} { + puts "ERROR: Mismatch in number of matching pins." + exit -1 + } + return $matching_pins +} + proc replace_net_source {net source_pin} { foreach pin [find_net_sinks $net 0] { replace_pin_source $pin $source_pin @@ -397,6 +505,8 @@ proc resolve_async_bram {inst} { puts "INFO: Resolving asynchronous BRAM patch: '$inst'." + set hier_sep [get_hierarchy_separator] + set raddr_w_nets [find_cell_nets $inst "raddr_w(\\\[\\d+\\\])?$"] set read_s_net [find_cell_nets $inst "read_s$"] set is_raddr_reg_net [find_cell_nets $inst "is_raddr_reg$"] @@ -433,7 +543,7 @@ proc resolve_async_bram {inst} { } # Create register next cell and return output pin - set reg_next_pin [create_register_next $raddr_src_cell "$inst/raddr_next"] + set reg_next_pin [create_register_next $inst $raddr_src_cell] if {$reg_next_pin == ""} { puts "ERROR: failed to create register next value for '$raddr_src_cell'." 
exit -1 @@ -444,7 +554,7 @@ proc resolve_async_bram {inst} { # Find the CE pin on raddr_src_cell if {$reg_ce_src_pin == ""} { - set reg_ce_pin [get_pins -of_objects $raddr_src_cell -filter {NAME =~ "*/CE"}] + set reg_ce_pin [get_pins "${raddr_src_cell}${hier_sep}CE"] if {[llength $reg_ce_pin] == 0} { puts "ERROR: No CE pin found on register cell '$raddr_src_cell'." exit -1 @@ -466,9 +576,10 @@ proc resolve_async_bram {inst} { # do we have a fully registered read address? if {[llength $reg_next_pins] == [llength $raddr_w_nets]} { puts "INFO: Fully registered read address detected." + + # Connect all reg_next_pins to all input pins attached to raddr_s_nets set addr_width [llength $raddr_w_nets] for {set addr_idx 0} {$addr_idx < $addr_width} {incr addr_idx} { - set raddr_w_net [lindex $raddr_w_nets $addr_idx] set raddr_s_net [lindex $raddr_s_nets $addr_idx] set reg_next_pin [lindex $reg_next_pins $addr_idx] puts "INFO: Connecting pin '$reg_next_pin' to '$raddr_s_net's pins." @@ -481,26 +592,35 @@ proc resolve_async_bram {inst} { replace_net_source $read_s_net $reg_ce_src_pin # Create Const<1>'s pin - set vcc_pin [getOrCreateVCCPin "$inst/VCC"] + set vcc_pin [getOrCreateVCCPin $inst] # Connect vcc_pin to all input pins attached to is_raddr_reg_net puts "INFO: Connecting pin '$vcc_pin' to '$is_raddr_reg_net's pins." replace_net_source $is_raddr_reg_net $vcc_pin + + # Remove all async_ram cells + foreach cell [find_nested_cells $inst "g_async_ram.*" 0] { + remove_cell_from_netlist $cell + } } else { puts "WARNING: Not all read addresses are registered!" # Create Const<0>'s pin - set gnd_pin [getOrCreateGNDPin "$inst/GND"] + set gnd_pin [getOrCreateGNDPin $inst] # Connect gnd_pin to all input pins attached to is_raddr_reg_net puts "INFO: Connecting pin '$gnd_pin' to '$is_raddr_reg_net's pins." 
replace_net_source $is_raddr_reg_net $gnd_pin + + # Remove all sync_ram cells + foreach cell [find_nested_cells $inst "g_sync_ram.*" 0] { + remove_cell_from_netlist $cell + } } - # Remove all placeholder cells + # Remove placeholder cell foreach cell [find_nested_cells $inst "placeholder$"] { - remove_cell $cell - if {$debug} {puts "DEBUG: Cell '$cell' was removed successfully."} + remove_cell_from_netlist $cell } } @@ -519,7 +639,26 @@ proc resolve_async_brams {} { } } +proc dump_async_bram_cells {} { + set bram_patch_cells [get_cells -hierarchical -filter {REF_NAME =~ "*VX_async_ram_patch*"}] + if {[llength $bram_patch_cells] != 0} { + foreach cell $bram_patch_cells { + puts "INFO: Found async BRAM patch cell: '$cell'." + set child_cells [find_cell_descendants $cell] + foreach child $child_cells { + set type [get_property REF_NAME $child] + puts "INFO: child cell: '$child', type: '$type'" + } + } + } else { + puts "INFO: No async BRAM patch cells found in the design." + } +} + } # Invoke the procedure to resolve async BRAM vortex::resolve_async_brams + +# dump async bram cells +#vortex::dump_async_bram_cells diff --git a/hw/scripts/xilinx_export_netlist.tcl b/hw/scripts/xilinx_export_netlist.tcl index 25a0d17e8..a6ff22ff5 100644 --- a/hw/scripts/xilinx_export_netlist.tcl +++ b/hw/scripts/xilinx_export_netlist.tcl @@ -1,3 +1,16 @@ +# Copyright © 2019-2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # Function to export netlist to a Graphviz DOT file proc export_netlist {dot_file_name} { # Open the DOT file for writing diff --git a/hw/syn/xilinx/README b/hw/syn/xilinx/README index 0fb83e71b..a1ca231fe 100644 --- a/hw/syn/xilinx/README +++ b/hw/syn/xilinx/README @@ -47,6 +47,9 @@ TARGET=hw PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 make chipscope # analyze build report vitis_analyzer build_xilinx_u50_gen3x16_xdma_5_202210_1_hw_4c/bin/vortex_afu.xclbin.link_summary +# resuming build for routing +TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.route_design" make > build.log 2>&1 & + # running test FPGA_BIN_DIR= TARGET=hw_emu ./ci/blackbox.sh --driver=xrt --app=demo FPGA_BIN_DIR= TARGET=hw ./ci/blackbox.sh --driver=xrt --app=demo diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile index 643724069..288031e2e 100644 --- a/hw/syn/xilinx/xrt/Makefile +++ b/hw/syn/xilinx/xrt/Makefile @@ -180,6 +180,7 @@ ifeq ($(TARGET), hw) cp $(BUILD_DIR)/_x/logs/link/vivado.log $(BUILD_DIR)/bin cp $(BUILD_DIR)/_x/logs/link/syn/ulp_vortex_afu_1_0_synth_1_runme.log $(BUILD_DIR)/bin cp $(BUILD_DIR)/_x/reports/link/syn/ulp_vortex_afu_1_0_synth_1_ulp_vortex_afu_1_0_utilization_synth.rpt $(BUILD_DIR)/bin + cp $(BUILD_DIR)/_x/reports/link/imp/impl_1_hw_bb_locked_utilization_placed.rpt $(BUILD_DIR)/bin cp $(BUILD_DIR)/_x/reports/link/imp/impl_1_hw_bb_locked_timing_summary_routed.rpt $(BUILD_DIR)/bin endif diff --git a/runtime/rtlsim/vortex.cpp b/runtime/rtlsim/vortex.cpp index 7ba7f9471..b2d9a8db3 100644 --- a/runtime/rtlsim/vortex.cpp +++ b/runtime/rtlsim/vortex.cpp @@ -78,10 +78,10 @@ class vx_device { _value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD; break; case VX_CAPS_NUM_MEM_BANKS: - _value = MEMORY_BANKS; + _value = PLATFORM_MEMORY_BANKS; break; case VX_CAPS_MEM_BANK_SIZE: - _value = 1ull << (MEM_ADDR_WIDTH / MEMORY_BANKS); + _value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_BANKS); break; default: 
std::cout << "invalid caps id: " << caps_id << std::endl; diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp index 8e4351e0a..b8b9ce24f 100644 --- a/runtime/simx/vortex.cpp +++ b/runtime/simx/vortex.cpp @@ -65,7 +65,7 @@ class vx_device { ~vx_device() { #ifdef VM_ENABLE global_mem_.release(PAGE_TABLE_BASE_ADDR); - // for (auto i = addr_mapping.begin(); i != addr_mapping.end(); i++) + // for (auto i = addr_mapping.begin(); i != addr_mapping.end(); i++) // page_table_mem_->release(i->second << MEM_PAGE_SIZE); delete virtual_mem_; delete page_table_mem_; @@ -113,10 +113,10 @@ class vx_device { _value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD; break; case VX_CAPS_NUM_MEM_BANKS: - _value = MEMORY_BANKS; + _value = PLATFORM_MEMORY_BANKS; break; case VX_CAPS_MEM_BANK_SIZE: - _value = 1ull << (MEM_ADDR_WIDTH / MEMORY_BANKS); + _value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_BANKS); break; default: std::cout << "invalid caps id: " << caps_id << std::endl; @@ -164,7 +164,7 @@ class vx_device { if ((STARTUP_ADDR <= dev_pAddr) && (dev_pAddr <= (STARTUP_ADDR + 0x40000))) return 0; - // Now all conditions are not met. Return true because the address needs translation + // Now all conditions are not met. 
Return true because the address needs translation return 1; } @@ -277,7 +277,7 @@ class vx_device { #ifdef VM_ENABLE uint64_t pAddr = page_table_walk(dest_addr); // uint64_t pAddr; - // try { + // try { // pAddr = page_table_walk(dest_addr); // } catch ( Page_Fault_Exception ) { // // HW: place holder @@ -466,18 +466,18 @@ class vx_device { CHECK_ERR(virtual_mem_reserve(STARTUP_ADDR, 0x40000, VX_MEM_READ_WRITE), { return err; }); - + if (virtual_mem_ == nullptr) { // virtual_mem_ does not intefere with physical mem, so no need to free space - + return 1; } - + if (VM_ADDR_MODE == BARE) DBGPRINT("[RT:init_VM] VA_MODE = BARE MODE(addr= 0x0)"); else CHECK_ERR(alloc_page_table(&pt_addr),{return err;}); - + CHECK_ERR(processor_.set_satp_by_addr(pt_addr),{return err;}); return 0; } @@ -604,7 +604,7 @@ class vx_device { } else { - // Leaf node found. + // Leaf node found. // Check RWX permissions according to access type. if (pte.r == 0) { diff --git a/sim/common/simobject.h b/sim/common/simobject.h index 31fc4c0e6..e6e6e42da 100644 --- a/sim/common/simobject.h +++ b/sim/common/simobject.h @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -27,9 +27,9 @@ class SimObjectBase; /////////////////////////////////////////////////////////////////////////////// class SimPortBase { -public: +public: virtual ~SimPortBase() {} - + SimObjectBase* module() const { return module_; } @@ -92,7 +92,7 @@ class SimPort : public SimPortBase { auto cycles = queue_.front().cycles; queue_.pop(); return cycles; - } + } void tx_callback(const TxCallback& callback) { tx_cb_ = callback; @@ -137,7 +137,7 @@ class SimEventBase { typedef std::shared_ptr Ptr; virtual ~SimEventBase() {} - + virtual void fire() const = 0; uint64_t cycles() const { @@ -161,7 +161,7 @@ class SimCallEvent : public SimEventBase { typedef std::function Func; - SimCallEvent(const Func& func, const Pkt& pkt, uint64_t cycles) + SimCallEvent(const Func& func, const Pkt& pkt, uint64_t cycles) : SimEventBase(cycles) , func_(func) , pkt_(pkt) @@ -194,8 +194,8 @@ class SimPortEvent : public SimEventBase { const_cast*>(port_)->transfer(pkt_, cycles_); } - SimPortEvent(const SimPort* port, const Pkt& pkt, uint64_t cycles) - : SimEventBase(cycles) + SimPortEvent(const SimPort* port, const Pkt& pkt, uint64_t cycles) + : SimEventBase(cycles) , port_(port) , pkt_(pkt) {} @@ -209,7 +209,7 @@ class SimPortEvent : public SimEventBase { } protected: - const SimPort* port_; + const SimPort* port_; Pkt pkt_; static MemoryPool> allocator_; @@ -230,11 +230,11 @@ class SimObjectBase { const std::string& name() const { return name_; - } + } protected: - SimObjectBase(const SimContext& ctx, const char* name); + SimObjectBase(const SimContext& ctx, const std::string& name); private: @@ -259,8 +259,8 @@ class SimObject : public SimObjectBase { protected: - SimObject(const SimContext& ctx, const char* name) - : SimObjectBase(ctx, name) + SimObject(const SimContext& ctx, const std::string& name) + : SimObjectBase(ctx, name) {} private: @@ -283,9 +283,9 @@ class SimObject : public SimObjectBase { }; class SimContext { -private: +private: SimContext() {} - + friend class 
SimPlatform; }; @@ -320,10 +320,10 @@ class SimPlatform { template void schedule(const typename SimCallEvent::Func& callback, - const Pkt& pkt, - uint64_t delay) { + const Pkt& pkt, + uint64_t delay) { assert(delay != 0); - auto evt = std::make_shared>(callback, pkt, cycles_ + delay); + auto evt = std::make_shared>(callback, pkt, cycles_ + delay); events_.emplace_back(evt); } @@ -341,10 +341,10 @@ class SimPlatform { auto evt_it_end = events_.end(); while (evt_it != evt_it_end) { auto& event = *evt_it; - if (cycles_ >= event->cycles()) { + if (cycles_ >= event->cycles()) { event->fire(); evt_it = events_.erase(evt_it); - } else { + } else { ++evt_it; } } @@ -352,7 +352,7 @@ class SimPlatform { for (auto& object : objects_) { object->do_tick(); } - // advance clock + // advance clock ++cycles_; } @@ -390,8 +390,8 @@ class SimPlatform { /////////////////////////////////////////////////////////////////////////////// -inline SimObjectBase::SimObjectBase(const SimContext&, const char* name) - : name_(name) +inline SimObjectBase::SimObjectBase(const SimContext&, const std::string& name) + : name_(name) {} template @@ -403,8 +403,8 @@ typename SimObject::Ptr SimObject::Create(Args&&... args) { template void SimPort::push(const Pkt& pkt, uint64_t delay) const { if (peer_ && !tx_cb_) { - reinterpret_cast*>(peer_)->push(pkt, delay); + reinterpret_cast*>(peer_)->push(pkt, delay); } else { SimPlatform::instance().schedule(this, pkt, delay); - } + } } diff --git a/sim/common/stringutil.h b/sim/common/stringutil.h index cddb5c3a3..ce3607c98 100644 --- a/sim/common/stringutil.h +++ b/sim/common/stringutil.h @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -47,7 +47,7 @@ class IndentStream : public std::streambuf { , indent_(indent, ' ') , owner_(nullptr) {} - + explicit IndentStream(std::ostream& dest, int indent = 4) : dest_(dest.rdbuf()) , isBeginLine_(true) @@ -76,3 +76,14 @@ class IndentStream : public std::streambuf { std::string indent_; std::ostream* owner_; }; + +template +std::string StrFormat(const std::string& fmt, Args... args) { + auto size = std::snprintf(nullptr, 0, fmt.c_str(), args...) + 1; + if (size <= 0) { + throw std::runtime_error("Error during formatting."); + } + std::vector buf(size); + std::snprintf(buf.data(), size, fmt.c_str(), args...); + return std::string(buf.data(), buf.data() + size - 1); +} \ No newline at end of file diff --git a/sim/simx/cache_cluster.h b/sim/simx/cache_cluster.h index 2ba26dc21..8c69c7e63 100644 --- a/sim/simx/cache_cluster.h +++ b/sim/simx/cache_cluster.h @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -21,81 +21,77 @@ class CacheCluster : public SimObject { public: std::vector>> CoreReqPorts; std::vector>> CoreRspPorts; - SimPort MemReqPort; - SimPort MemRspPort; - - CacheCluster(const SimContext& ctx, - const char* name, - uint32_t num_inputs, - uint32_t num_caches, - uint32_t num_requests, - const CacheSim::Config& cache_config) + std::vector> MemReqPorts; + std::vector> MemRspPorts; + + CacheCluster(const SimContext& ctx, + const char* name, + uint32_t num_inputs, + uint32_t num_units, + const CacheSim::Config& cache_config) : SimObject(ctx, name) - , CoreReqPorts(num_inputs, std::vector>(num_requests, this)) - , CoreRspPorts(num_inputs, std::vector>(num_requests, this)) - , MemReqPort(this) - , MemRspPort(this) - , caches_(MAX(num_caches, 0x1)) { + , CoreReqPorts(num_inputs, std::vector>(cache_config.num_inputs, this)) + , CoreRspPorts(num_inputs, std::vector>(cache_config.num_inputs, this)) + , MemReqPorts(cache_config.mem_ports, this) + , MemRspPorts(cache_config.mem_ports, this) + , caches_(MAX(num_units, 0x1)) { CacheSim::Config cache_config2(cache_config); - if (0 == num_caches) { - num_caches = 1; + if (0 == num_units) { + num_units = 1; cache_config2.bypass = true; } char sname[100]; - - std::vector input_arbs(num_inputs); - for (uint32_t j = 0; j < num_inputs; ++j) { - snprintf(sname, 100, "%s-input-arb%d", name, j); - input_arbs.at(j) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, cache_config.num_inputs); - for (uint32_t i = 0; i < num_requests; ++i) { - this->CoreReqPorts.at(j).at(i).bind(&input_arbs.at(j)->ReqIn.at(i)); - input_arbs.at(j)->RspIn.at(i).bind(&this->CoreRspPorts.at(j).at(i)); - } - } - std::vector mem_arbs(cache_config.num_inputs); + // Arbitrate incoming core interfaces + std::vector input_arbs(cache_config.num_inputs); for (uint32_t i = 0; i < cache_config.num_inputs; ++i) { - snprintf(sname, 100, "%s-mem-arb%d", name, i); - mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_inputs, 
num_caches); + snprintf(sname, 100, "%s-input-arb%d", name, i); + input_arbs.at(i) = MemArbiter::Create(sname, ArbiterType::RoundRobin, num_inputs, num_units); for (uint32_t j = 0; j < num_inputs; ++j) { - input_arbs.at(j)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(j)); - mem_arbs.at(i)->RspIn.at(j).bind(&input_arbs.at(j)->RspOut.at(i)); + this->CoreReqPorts.at(j).at(i).bind(&input_arbs.at(i)->ReqIn.at(j)); + input_arbs.at(i)->RspIn.at(j).bind(&this->CoreRspPorts.at(j).at(i)); } } - snprintf(sname, 100, "%s-cache-arb", name); - auto cache_arb = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_caches, 1); + // Arbitrate outgoing memory interfaces + std::vector mem_arbs(cache_config.mem_ports); + for (uint32_t i = 0; i < cache_config.mem_ports; ++i) { + snprintf(sname, 100, "%s-mem-arb%d", name, i); + mem_arbs.at(i) = MemArbiter::Create(sname, ArbiterType::RoundRobin, num_units, 1); + mem_arbs.at(i)->ReqOut.at(0).bind(&this->MemReqPorts.at(i)); + this->MemRspPorts.at(i).bind(&mem_arbs.at(i)->RspOut.at(0)); + } - for (uint32_t i = 0; i < num_caches; ++i) { + // Connect caches + for (uint32_t i = 0; i < num_units; ++i) { snprintf(sname, 100, "%s-cache%d", name, i); caches_.at(i) = CacheSim::Create(sname, cache_config2); for (uint32_t j = 0; j < cache_config.num_inputs; ++j) { - mem_arbs.at(j)->ReqOut.at(i).bind(&caches_.at(i)->CoreReqPorts.at(j)); - caches_.at(i)->CoreRspPorts.at(j).bind(&mem_arbs.at(j)->RspOut.at(i)); + input_arbs.at(j)->ReqOut.at(i).bind(&caches_.at(i)->CoreReqPorts.at(j)); + caches_.at(i)->CoreRspPorts.at(j).bind(&input_arbs.at(j)->RspOut.at(i)); } - caches_.at(i)->MemReqPorts.at(0).bind(&cache_arb->ReqIn.at(i)); - cache_arb->RspIn.at(i).bind(&caches_.at(i)->MemRspPorts.at(0)); + for (uint32_t j = 0; j < cache_config.mem_ports; ++j) { + caches_.at(i)->MemReqPorts.at(j).bind(&mem_arbs.at(j)->ReqIn.at(i)); + mem_arbs.at(j)->RspIn.at(i).bind(&caches_.at(i)->MemRspPorts.at(j)); + } } - - cache_arb->ReqOut.at(0).bind(&this->MemReqPort); - 
this->MemRspPort.bind(&cache_arb->RspOut.at(0)); } ~CacheCluster() {} void reset() {} - + void tick() {} CacheSim::PerfStats perf_stats() const { CacheSim::PerfStats perf; for (auto cache : caches_) { perf += cache->perf_stats(); - } + } return perf; } diff --git a/sim/simx/cache_sim.cpp b/sim/simx/cache_sim.cpp index 27a73ba72..02997277f 100644 --- a/sim/simx/cache_sim.cpp +++ b/sim/simx/cache_sim.cpp @@ -19,7 +19,6 @@ #include #include #include -#include using namespace vortex; @@ -305,8 +304,8 @@ class CacheSim::Impl { Config config_; params_t params_; std::vector banks_; - MemSwitch::Ptr bank_switch_; - MemSwitch::Ptr bypass_switch_; + MemArbiter::Ptr bank_arb_; + std::vector nc_arbs_; std::vector> mem_req_ports_; std::vector> mem_rsp_ports_; std::vector pipeline_reqs_; @@ -322,88 +321,51 @@ class CacheSim::Impl { , config_(config) , params_(config) , banks_((1 << config.B), {config, params_}) + , nc_arbs_(config.mem_ports) , mem_req_ports_((1 << config.B), simobject) , mem_rsp_ports_((1 << config.B), simobject) , pipeline_reqs_((1 << config.B), config.ports_per_bank) { char sname[100]; - snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str()); if (config_.bypass) { - bypass_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, config_.num_inputs); + snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str()); + auto bypass_arb = MemArbiter::Create(sname, ArbiterType::RoundRobin, config_.num_inputs, config_.mem_ports); for (uint32_t i = 0; i < config_.num_inputs; ++i) { - simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i)); - bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i)); + simobject->CoreReqPorts.at(i).bind(&bypass_arb->ReqIn.at(i)); + bypass_arb->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i)); + } + for (uint32_t i = 0; i < config_.mem_ports; ++i) { + bypass_arb->ReqOut.at(i).bind(&simobject->MemReqPorts.at(i)); + simobject->MemRspPorts.at(i).bind(&bypass_arb->RspOut.at(i)); } - 
bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPorts.at(0)); - simobject->MemRspPorts.at(0).bind(&bypass_switch_->RspOut.at(0)); return; } - if (strcmp(simobject->name().c_str(), "l3cache")) { - bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2); - bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPorts.at(0)); - simobject->MemRspPorts.at(0).bind(&bypass_switch_->RspOut.at(0)); - - if (config.B != 0) { - snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str()); - bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B)); - for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) { - mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i)); - bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i)); - } - bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0)); - bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0)); - } else { - mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0)); - bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0)); - } - } else { - // TODO: Change this into a crossbar - uint32_t max = MAX(2, config_.num_inputs); - //printf("%s connecting\n", simobject_->name().c_str()); - //3 - if (config.B != 0) { - bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, max, max); - for (uint32_t i = 0; i < max; ++i) { - //printf("%s connecting input=%d to MemPorts\n", simobject_->name().c_str(), i); - bypass_switch_->ReqOut.at(i).bind(&simobject->MemReqPorts.at(i % (1 << config.B))); - simobject->MemRspPorts.at(i % (1 << config.B)).bind(&bypass_switch_->RspOut.at(i)); - } - } else { - bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2); - bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPorts.at(0)); - simobject->MemRspPorts.at(0).bind(&bypass_switch_->RspOut.at(0)); - } + // create non-cacheable arbiter + for (uint32_t i = 0; i < config_.mem_ports; ++i) { + snprintf(sname, 100, "%s-nc-arb%d", simobject->name().c_str(), i); + nc_arbs_.at(i) = 
MemArbiter::Create(sname, ArbiterType::Priority, 2, 1); + } - if (config.B != 0) - { - snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str()); - bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B), (1 << config.B)); - for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) - { - //1 - //printf("%s Connecting memory ports to bank=%d\n", simobject_->name().c_str(), i); - mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i)); - bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i)); - } - //2 - if (config_.num_inputs > 1) { - for (uint32_t i = 0; i < max; ++i) { - //printf("%s connecting bank and bypass port=%d\n", simobject_->name().c_str(), i); - bank_switch_->ReqOut.at(i % (1 << config.B)).bind(&bypass_switch_->ReqIn.at(i)); - bypass_switch_->RspIn.at(i).bind(&bank_switch_->RspOut.at(i % (1 << config.B))); - } - } else { - bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0)); - bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0)); - } - } - else - { - mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0)); - bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0)); - } + // Connect non-cacheable arbiter output to outgoing memory ports + for (uint32_t i = 0; i < config_.mem_ports; ++i) { + nc_arbs_.at(i)->ReqOut.at(0).bind(&simobject->MemReqPorts.at(i)); + simobject->MemRspPorts.at(i).bind(&nc_arbs_.at(i)->RspOut.at(0)); + } + + // Create bank's memory arbiter + snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str()); + auto bank_mem_arb = MemArbiter::Create(sname, ArbiterType::RoundRobin, (1 << config.B), config_.mem_ports); + for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) { + mem_req_ports_.at(i).bind(&bank_mem_arb->ReqIn.at(i)); + bank_mem_arb->RspIn.at(i).bind(&mem_rsp_ports_.at(i)); + } + + // Connect bank's memory arbiter to non-cacheable arbiter's input 0 + for (uint32_t i = 0; i < config_.mem_ports; ++i) { + bank_mem_arb->ReqOut.at(i).bind(&nc_arbs_.at(i)->ReqIn.at(0)); + 
nc_arbs_.at(i)->RspIn.at(0).bind(&bank_mem_arb->RspOut.at(i)); } // calculate cache initialization cycles @@ -434,8 +396,8 @@ class CacheSim::Impl { } // handle cache bypasss responses - { - auto& bypass_port = bypass_switch_->RspIn.at(1); + for (uint32_t i = 0, n = config_.mem_ports; i < n; ++i) { + auto& bypass_port = nc_arbs_.at(i)->RspIn.at(1); if (!bypass_port.empty()) { auto& mem_rsp = bypass_port.front(); this->processBypassResponse(mem_rsp); @@ -468,7 +430,7 @@ class CacheSim::Impl { continue; auto& mem_rsp = mem_rsp_port.front(); - DT(3, simobject_->name() << "-bank" << bank_id << " fill-rsp: " << mem_rsp); + DT(3, simobject_->name() << "-bank" << bank_id << "-fill-rsp: " << mem_rsp); pipeline_req.type = bank_req_t::Fill; pipeline_req.tag = mem_rsp.tag; mem_rsp_port.pop(); @@ -533,7 +495,7 @@ class CacheSim::Impl { bank_req.type = bank_req_t::Core; bank_req.write = core_req.write; pipeline_req = bank_req; - DT(3, simobject_->name() << " core-req: " << core_req); + DT(3, simobject_->name() << "-core-req: " << core_req); } if (core_req.write) @@ -561,21 +523,22 @@ class CacheSim::Impl { uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs; MemRsp core_rsp{tag, mem_rsp.cid, mem_rsp.uuid}; simobject_->CoreRspPorts.at(req_id).push(core_rsp, config_.latency); - DT(3, simobject_->name() << " bypass-core-rsp: " << core_rsp); + DT(3, simobject_->name() << "-bypass-core-rsp: " << core_rsp); } void processBypassRequest(const MemReq& core_req, uint32_t req_id) { { MemReq mem_req(core_req); mem_req.tag = (core_req.tag << params_.log2_num_inputs) + req_id; - bypass_switch_->ReqIn.at(1).push(mem_req, 1); - DT(3, simobject_->name() << " bypass-dram-req: " << mem_req); + uint32_t mem_port = req_id % config_.mem_ports; + nc_arbs_.at(mem_port)->ReqIn.at(1).push(mem_req, 1); + DT(3, simobject_->name() << "-bypass-dram-req: " << mem_req); } if (core_req.write && config_.write_reponse) { MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid}; 
simobject_->CoreRspPorts.at(req_id).push(core_rsp, 1); - DT(3, simobject_->name() << " bypass-core-rsp: " << core_rsp); + DT(3, simobject_->name() << "-bypass-core-rsp: " << core_rsp); } } @@ -605,7 +568,7 @@ class CacheSim::Impl { continue; MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid}; simobject_->CoreRspPorts.at(info.req_id).push(core_rsp, config_.latency); - DT(3, simobject_->name() << "-bank" << bank_id << " replay: " << core_rsp); + DT(3, simobject_->name() << "-bank" << bank_id << "-replay: " << core_rsp); } } } break; @@ -649,7 +612,7 @@ class CacheSim::Impl { mem_req.cid = pipeline_req.cid; mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).push(mem_req, 1); - DT(3, simobject_->name() << "-bank" << bank_id << " writethrough: " << mem_req); + DT(3, simobject_->name() << "-bank" << bank_id << "-writethrough: " << mem_req); } else { // mark line as dirty hit_line.dirty = true; @@ -662,7 +625,7 @@ class CacheSim::Impl { continue; MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid}; simobject_->CoreRspPorts.at(info.req_id).push(core_rsp, config_.latency); - DT(3, simobject_->name() << "-bank" << bank_id << " core-rsp: " << core_rsp); + DT(3, simobject_->name() << "-bank" << bank_id << "-core-rsp: " << core_rsp); } } } else { @@ -681,7 +644,7 @@ class CacheSim::Impl { mem_req.write = true; mem_req.cid = pipeline_req.cid; mem_req_ports_.at(bank_id).push(mem_req, 1); - DT(3, simobject_->name() << "-bank" << bank_id << " writeback: " << mem_req); + DT(3, simobject_->name() << "-bank" << bank_id << "-writeback: " << mem_req); ++perf_stats_.evictions; } } @@ -695,7 +658,7 @@ class CacheSim::Impl { mem_req.cid = pipeline_req.cid; mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).push(mem_req, 1); - DT(3, simobject_->name() << "-bank" << bank_id << " writethrough: " << mem_req); + DT(3, simobject_->name() << "-bank" << bank_id << "-writethrough: " << mem_req); } // send core response if (config_.write_reponse) { 
@@ -704,7 +667,7 @@ class CacheSim::Impl { continue; MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid}; simobject_->CoreRspPorts.at(info.req_id).push(core_rsp, config_.latency); - DT(3, simobject_->name() << "-bank" << bank_id << " core-rsp: " << core_rsp); + DT(3, simobject_->name() << "-bank" << bank_id << "-core-rsp: " << core_rsp); } } } else { @@ -713,7 +676,7 @@ class CacheSim::Impl { // allocate MSHR auto mshr_id = bank.mshr.allocate(pipeline_req, (free_line_id != -1) ? free_line_id : repl_line_id); - DT(3, simobject_->name() << "-bank" << bank_id << " mshr-enqueue: " << pipeline_req); + DT(3, simobject_->name() << "-bank" << bank_id << "-mshr-enqueue: " << pipeline_req); // send fill request if (!mshr_pending) { @@ -724,7 +687,7 @@ class CacheSim::Impl { mem_req.cid = pipeline_req.cid; mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).push(mem_req, 1); - DT(3, simobject_->name() << "-bank" << bank_id << " fill: " << mem_req); + DT(3, simobject_->name() << "-bank" << bank_id << "-fill: " << mem_req); ++pending_fill_reqs_; } } @@ -743,8 +706,8 @@ CacheSim::CacheSim(const SimContext& ctx, const char* name, const Config& config : SimObject(ctx, name) , CoreReqPorts(config.num_inputs, this) , CoreRspPorts(config.num_inputs, this) - , MemReqPorts(NUM_MEM_PORTS, this) - , MemRspPorts(NUM_MEM_PORTS, this) + , MemReqPorts(config.mem_ports, this) + , MemRspPorts(config.mem_ports, this) , impl_(new Impl(this, config)) {} diff --git a/sim/simx/cache_sim.h b/sim/simx/cache_sim.h index aad489546..1e586fed7 100644 --- a/sim/simx/cache_sim.h +++ b/sim/simx/cache_sim.h @@ -30,6 +30,7 @@ class CacheSim : public SimObject { uint8_t addr_width; // word address bits uint8_t ports_per_bank; // number of ports per bank uint8_t num_inputs; // number of inputs + uint8_t mem_ports; // memory ports bool write_back; // is write-back bool write_reponse; // enable write response uint16_t mshr_size; // MSHR buffer size diff --git a/sim/simx/cluster.cpp 
b/sim/simx/cluster.cpp index 56e05e7a5..ebcaa3e39 100644 --- a/sim/simx/cluster.cpp +++ b/sim/simx/cluster.cpp @@ -20,9 +20,9 @@ Cluster::Cluster(const SimContext& ctx, ProcessorImpl* processor, const Arch &arch, const DCRS &dcrs) - : SimObject(ctx, "cluster") - , mem_req_port(this) - , mem_rsp_port(this) + : SimObject(ctx, StrFormat("cluster%d", cluster_id)) + , mem_req_ports(L2_MEM_PORTS, this) + , mem_rsp_ports(L2_MEM_PORTS, this) , cluster_id_(cluster_id) , processor_(processor) , sockets_(NUM_SOCKETS) @@ -35,31 +35,14 @@ Cluster::Cluster(const SimContext& ctx, // create sockets - snprintf(sname, 100, "cluster%d-icache-arb", cluster_id); - auto icache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster); - - snprintf(sname, 100, "cluster%d-dcache-arb", cluster_id); - auto dcache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster); - for (uint32_t i = 0; i < sockets_per_cluster; ++i) { uint32_t socket_id = cluster_id * sockets_per_cluster + i; - auto socket = Socket::Create(socket_id, - this, - arch, - dcrs); - - socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i)); - icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port); - - socket->dcache_mem_req_port.bind(&dcache_switch->ReqIn.at(i)); - dcache_switch->RspIn.at(i).bind(&socket->dcache_mem_rsp_port); - - sockets_.at(i) = socket; + sockets_.at(i) = Socket::Create(socket_id, this, arch, dcrs); } // Create l2cache - snprintf(sname, 100, "cluster%d-l2cache", cluster_id); + snprintf(sname, 100, "%s-l2cache", this->name().c_str()); l2cache_ = CacheSim::Create(sname, CacheSim::Config{ !L2_ENABLED, log2ceil(L2_CACHE_SIZE),// C @@ -69,21 +52,27 @@ Cluster::Cluster(const SimContext& ctx, log2ceil(L2_NUM_BANKS), // B XLEN, // address bits 1, // number of ports - 2, // request size + L2_NUM_REQS, // request size + L2_MEM_PORTS, // memory ports L2_WRITEBACK, // write-back false, // write response L2_MSHR_SIZE, // mshr size 2, // pipeline latency }); - 
l2cache_->MemReqPorts.at(0).bind(&this->mem_req_port); - this->mem_rsp_port.bind(&l2cache_->MemRspPorts.at(0)); - - icache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(0)); - l2cache_->CoreRspPorts.at(0).bind(&icache_switch->RspOut.at(0)); + // connect l2cache core interfaces + for (uint32_t i = 0; i < sockets_per_cluster; ++i) { + for (uint32_t j = 0; j < L1_MEM_PORTS; ++j) { + sockets_.at(i)->mem_req_ports.at(j).bind(&l2cache_->CoreReqPorts.at(i * L1_MEM_PORTS + j)); + l2cache_->CoreRspPorts.at(i * L1_MEM_PORTS + j).bind(&sockets_.at(i)->mem_rsp_ports.at(j)); + } + } - dcache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(1)); - l2cache_->CoreRspPorts.at(1).bind(&dcache_switch->RspOut.at(0)); + // connect l2cache memory interfaces + for (uint32_t i = 0; i < L2_MEM_PORTS; ++i) { + l2cache_->MemReqPorts.at(i).bind(&this->mem_req_ports.at(i)); + this->mem_rsp_ports.at(i).bind(&l2cache_->MemRspPorts.at(i)); + } } Cluster::~Cluster() { diff --git a/sim/simx/cluster.h b/sim/simx/cluster.h index df96031c3..d31aa1672 100644 --- a/sim/simx/cluster.h +++ b/sim/simx/cluster.h @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -32,13 +32,13 @@ class Cluster : public SimObject { CacheSim::PerfStats l2cache; }; - SimPort mem_req_port; - SimPort mem_rsp_port; + std::vector> mem_req_ports; + std::vector> mem_rsp_ports; - Cluster(const SimContext& ctx, + Cluster(const SimContext& ctx, uint32_t cluster_id, - ProcessorImpl* processor, - const Arch &arch, + ProcessorImpl* processor, + const Arch &arch, const DCRS &dcrs); ~Cluster(); @@ -63,16 +63,16 @@ class Cluster : public SimObject { bool running() const; - int get_exitcode() const; + int get_exitcode() const; void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id); PerfStats perf_stats() const; - + private: uint32_t cluster_id_; ProcessorImpl* processor_; - std::vector sockets_; + std::vector sockets_; std::vector barriers_; CacheSim::Ptr l2cache_; uint32_t cores_per_socket_; diff --git a/sim/simx/constants.h b/sim/simx/constants.h index c651bbfc4..6a79722ae 100644 --- a/sim/simx/constants.h +++ b/sim/simx/constants.h @@ -27,10 +27,15 @@ inline constexpr int LSU_WORD_SIZE = (XLEN / 8); inline constexpr int LSU_CHANNELS = NUM_LSU_LANES; inline constexpr int LSU_NUM_REQS = (NUM_LSU_BLOCKS * LSU_CHANNELS); +// The dcache uses coalesced memory blocks inline constexpr int DCACHE_WORD_SIZE = LSU_LINE_SIZE; inline constexpr int DCACHE_CHANNELS = UP((NUM_LSU_LANES * (XLEN / 8)) / DCACHE_WORD_SIZE); -inline constexpr int DCACHE_NUM_REQS = (NUM_LSU_BLOCKS * DCACHE_CHANNELS); +inline constexpr int DCACHE_NUM_REQS = (NUM_LSU_BLOCKS * DCACHE_CHANNELS); inline constexpr int NUM_SOCKETS = UP(NUM_CORES / SOCKET_SIZE); +inline constexpr int L2_NUM_REQS = NUM_SOCKETS * L1_MEM_PORTS; + +inline constexpr int L3_NUM_REQS = NUM_CLUSTERS * L2_MEM_PORTS; + inline constexpr int PER_ISSUE_WARPS = NUM_WARPS / ISSUE_WIDTH; \ No newline at end of file diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 537230a80..5e5b9cf3a 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -30,7 +30,7 @@ Core::Core(const SimContext& ctx, Socket* socket, const Arch 
&arch, const DCRS &dcrs) - : SimObject(ctx, "core") + : SimObject(ctx, StrFormat("core%d", core_id)) , icache_req_ports(1, this) , icache_rsp_ports(1, this) , dcache_req_ports(DCACHE_NUM_REQS, this) @@ -44,7 +44,7 @@ Core::Core(const SimContext& ctx, , operands_(ISSUE_WIDTH) , dispatchers_((uint32_t)FUType::Count) , func_units_((uint32_t)FUType::Count) - , lsu_demux_(NUM_LSU_BLOCKS) + , lmem_switch_(NUM_LSU_BLOCKS) , mem_coalescers_(NUM_LSU_BLOCKS) , lsu_dcache_adapter_(NUM_LSU_BLOCKS) , lsu_lmem_adapter_(NUM_LSU_BLOCKS) @@ -59,12 +59,12 @@ Core::Core(const SimContext& ctx, // create the memory coalescer for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) { - snprintf(sname, 100, "core%d-coalescer%d", core_id, i); + snprintf(sname, 100, "%s-coalescer%d", this->name().c_str(), i); mem_coalescers_.at(i) = MemCoalescer::Create(sname, LSU_CHANNELS, DCACHE_CHANNELS, DCACHE_WORD_SIZE, LSUQ_OUT_SIZE, 1); } // create local memory - snprintf(sname, 100, "core%d-local_mem", core_id); + snprintf(sname, 100, "%s-local_mem", this->name().c_str()); local_mem_ = LocalMem::Create(sname, LocalMem::Config{ (1 << LMEM_LOG_SIZE), LSU_WORD_SIZE, @@ -73,31 +73,31 @@ Core::Core(const SimContext& ctx, false }); - // create lsu demux + // create lmem switch for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) { - snprintf(sname, 100, "core%d-lsu_demux%d", core_id, i); - lsu_demux_.at(i) = LocalMemDemux::Create(sname, 1); + snprintf(sname, 100, "%s-lmem_switch%d", this->name().c_str(), i); + lmem_switch_.at(i) = LocalMemSwitch::Create(sname, 1); } // create lsu dcache adapter for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) { - snprintf(sname, 100, "core%d-lsu_dcache_adapter%d", core_id, i); + snprintf(sname, 100, "%s-lsu_dcache_adapter%d", this->name().c_str(), i); lsu_dcache_adapter_.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1); } // create lsu lmem adapter for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) { - snprintf(sname, 100, "core%d-lsu_lmem_adapter%d", core_id, i); + snprintf(sname, 100, 
"%s-lsu_lmem_adapter%d", this->name().c_str(), i); lsu_lmem_adapter_.at(i) = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1); } // connect lsu demux for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) { - lsu_demux_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn); - mem_coalescers_.at(b)->RspIn.bind(&lsu_demux_.at(b)->RspDC); + lmem_switch_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn); + mem_coalescers_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspDC); - lsu_demux_.at(b)->ReqLmem.bind(&lsu_lmem_adapter_.at(b)->ReqIn); - lsu_lmem_adapter_.at(b)->RspIn.bind(&lsu_demux_.at(b)->RspLmem); + lmem_switch_.at(b)->ReqLmem.bind(&lsu_lmem_adapter_.at(b)->ReqIn); + lsu_lmem_adapter_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspLmem); } // connect coalescer-adapter @@ -130,7 +130,7 @@ Core::Core(const SimContext& ctx, dispatchers_.at((int)FUType::LSU) = SimPlatform::instance().create_object(arch, 2, NUM_LSU_BLOCKS, NUM_LSU_LANES); dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object(arch, 2, NUM_SFU_BLOCKS, NUM_SFU_LANES); dispatchers_.at((int)FUType::TCU) = SimPlatform::instance().create_object(arch, 2, NUM_TCU_BLOCKS, NUM_TCU_LANES); - + // initialize execute units func_units_.at((int)FUType::ALU) = SimPlatform::instance().create_object(this); func_units_.at((int)FUType::FPU) = SimPlatform::instance().create_object(this); @@ -140,8 +140,8 @@ Core::Core(const SimContext& ctx, // bind commit arbiters for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { - snprintf(sname, 100, "core%d-commit-arb%d", core_id, i); - auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)FUType::Count, 1); + snprintf(sname, 100, "%s-commit-arb%d", this->name().c_str(), i); + auto arbiter = TraceArbiter::Create(sname, ArbiterType::RoundRobin, (uint32_t)FUType::Count, 1); for (uint32_t j = 0; j < (uint32_t)FUType::Count; ++j) { func_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j)); } diff --git a/sim/simx/core.h b/sim/simx/core.h index e538350dd..a058e9a10 100644 --- 
a/sim/simx/core.h +++ b/sim/simx/core.h @@ -34,7 +34,7 @@ class Socket; class Arch; class DCRS; -using TraceSwitch = Mux; +using TraceArbiter = Arbiter; class Core : public SimObject { public: @@ -154,7 +154,7 @@ class Core : public SimObject { std::vector dispatchers_; std::vector func_units_; LocalMem::Ptr local_mem_; - std::vector lsu_demux_; + std::vector lmem_switch_; std::vector mem_coalescers_; std::vector lsu_dcache_adapter_; std::vector lsu_lmem_adapter_; @@ -169,7 +169,7 @@ class Core : public SimObject { PerfStats perf_stats_; - std::vector commit_arbs_; + std::vector commit_arbs_; uint32_t commit_exe_; uint32_t ibuffer_idx_; diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index dd8253571..42b10fce2 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -1421,7 +1421,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { std::abort(); } } break; - case Opcode::TCU: + case Opcode::TCU: { //TODO - make it data-type flexible uint32_t mem_bytes = 1; DP(3, "mem_bytes=" << mem_bytes << std::endl); @@ -1443,7 +1443,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { //LOAD if(num_threads > tc_size*tc_size*n_tiles*TC_per_warp) - { + { num_threads_actv = tc_size*tc_size*n_tiles*TC_per_warp; num_data_per_thread = 1; } @@ -1456,7 +1456,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { //STORE if(num_threads > tc_size*tc_size*TC_per_warp) - { + { num_threads_actv_st = tc_size*tc_size*TC_per_warp; num_data_per_thread_st = 1; } @@ -1466,30 +1466,30 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { num_data_per_thread_st = (tc_size*tc_size)/num_threads_per_tc; } data_bytes_store = mem_bytes*num_data_per_thread_st; - + DP(3, "Num Tiles=" << n_tiles << std::endl); - + switch (func3) { - case 0: - { //Matrix Load + case 0: + { //Matrix Load DP (4, "TCU LOAD"); trace->fu_type = FUType::LSU; trace->lsu_type = 
LsuType::TCU_LOAD; - + trace->src_regs[0] = {RegType::Integer, rsrc0}; auto trace_data = std::make_shared(num_threads); trace->data = trace_data; - - for (uint32_t t = thread_start; t < num_threads_actv; ++t) + + for (uint32_t t = thread_start; t < num_threads_actv; ++t) { if (!warp.tmask.test(t)) continue; - DP(3, "Thread ID" << t); + DP(3, "Thread ID" << t); uint32_t base_addr = rsdata[t][0].i ; trace_data->mem_addrs.at(t) = {base_addr, data_bytes_load}; - + //Load A or B (depends on immsrc) int loop_offset = 0; DP(3, "n_tiles = " << n_tiles << "; num_data_per_thread = " << num_data_per_thread <fu_type = FUType::LSU; trace->lsu_type = LsuType::TCU_STORE; @@ -1513,12 +1513,12 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { auto trace_data = std::make_shared(num_threads); trace->data = trace_data; - for (uint32_t t = thread_start; t < num_threads_actv_st; ++t) + for (uint32_t t = thread_start; t < num_threads_actv_st; ++t) { if (!warp.tmask.test(t)) continue; - DP(3, "Thread ID" << t); + DP(3, "Thread ID" << t); uint32_t base_addr = rsdata[t][0].i ; trace_data->mem_addrs.at(t) = {base_addr, data_bytes_store}; @@ -1529,7 +1529,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { Word* temp_ref = &(warp.ireg_file.at(t).at(rsrc0)); *temp_ref = scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread_st) + n]; - this->dcache_write(temp_ref, base_addr+(n*mem_bytes), mem_bytes); + this->dcache_write(temp_ref, base_addr+(n*mem_bytes), mem_bytes); } } //Clear the scratchpad @@ -1539,18 +1539,18 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { } } break; - case 2: + case 2: { //Matrix Multiply DP(4, "TCU MULTIPLY MAT"); trace->fu_type = FUType::TCU; trace->tcu_type = TCUType::TCU_MUL; uint32_t threads_per_tc = MAX (1, num_threads/TC_per_warp); - for (uint32_t t = thread_start; t < num_threads_actv; ++t) + for (uint32_t t = thread_start; t < num_threads_actv; ++t) { 
if (!warp.tmask.test(t)) continue; - - DP(3, "Thread ID" << t); + + DP(3, "Thread ID" << t); //TC operation [only 1 thread in 1 warp needs to do this] if (t%threads_per_tc == 0) { @@ -1563,7 +1563,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size; uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2; for(int tiles = 0 ; tiles < n_tiles ; tiles++) //What's the HW implication of this?? A counter implementation? - { + { for (int i = 0; i < tc_size; i++) { //ROW-1 for (int j = 0; j < tc_size; j++) { //COL-2 int sum = 0; diff --git a/sim/simx/func_unit.cpp b/sim/simx/func_unit.cpp index a182f6d8b..d33a0ac1c 100644 --- a/sim/simx/func_unit.cpp +++ b/sim/simx/func_unit.cpp @@ -116,12 +116,12 @@ void LsuUnit::tick() { // handle memory responses for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) { - auto& lsu_rsp_port = core_->lsu_demux_.at(b)->RspIn; + auto& lsu_rsp_port = core_->lmem_switch_.at(b)->RspIn; if (lsu_rsp_port.empty()) continue; auto& state = states_.at(b); auto& lsu_rsp = lsu_rsp_port.front(); - DT(3, this->name() << " mem-rsp: " << lsu_rsp); + DT(3, this->name() << "-mem-rsp: " << lsu_rsp); auto& entry = state.pending_rd_reqs.at(lsu_rsp.tag); auto trace = entry.trace; assert(!entry.mask.none()); @@ -146,7 +146,7 @@ void LsuUnit::tick() { continue; Outputs.at(iw).push(state.fence_trace, 1); state.fence_lock = false; - DT(3, this->name() << " fence-unlock: " << state.fence_trace); + DT(3, this->name() << "-fence-unlock: " << state.fence_trace); } // check input queue @@ -160,7 +160,7 @@ void LsuUnit::tick() { // schedule fence lock state.fence_trace = trace; state.fence_lock = true; - DT(3, this->name() << " fence-lock: " << *trace); + DT(3, this->name() << "-fence-lock: " << *trace); // remove input input.pop(); continue; @@ -171,7 +171,7 @@ void LsuUnit::tick() { // check pending queue capacity if (!is_write && state.pending_rd_reqs.full()) { if 
(!trace->log_once(true)) { - DT(4, "*** " << this->name() << " queue-full: " << *trace); + DT(4, "*** " << this->name() << "-queue-full: " << *trace); } continue; } else { @@ -201,8 +201,8 @@ void LsuUnit::tick() { lsu_req.uuid = trace->uuid; // send memory request - core_->lsu_demux_.at(block_idx)->ReqIn.push(lsu_req); - DT(3, this->name() << " mem-req: " << lsu_req); + core_->lmem_switch_.at(block_idx)->ReqIn.push(lsu_req); + DT(3, this->name() << "-mem-req: " << lsu_req); // update stats auto num_addrs = lsu_req.mask.count(); @@ -237,7 +237,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) { { req_per_thread= (1>(trace_data->mem_addrs.at(0).size)/4)? 1: ((trace_data->mem_addrs.at(0).size)/4); } - + auto t0 = trace->pid * NUM_LSU_LANES; for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) { @@ -246,11 +246,11 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) { continue; int req_idx = block_idx * LSU_CHANNELS + (i % LSU_CHANNELS); - auto& dcache_req_port = core_->lsu_demux_.at(req_idx)->ReqIn; + auto& dcache_req_port = core_->lmem_switch_.at(req_idx)->ReqIn; auto mem_addr = trace_data->mem_addrs.at(t); auto type = get_addr_type(mem_addr.addr); - // DT(3, "addr_type = " << type << ", " << *trace); + // DT(3, "addr_type = " << type << ", " << *trace); uint32_t mem_bytes = 1; for (int i = 0; i < req_per_thread; i++) { @@ -261,7 +261,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) { mem_req.tag = tag; mem_req.cid = trace->cid; mem_req.uuid = trace->uuid; - + dcache_req_port.push(mem_req, 1); DT(3, "mem-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag << ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace); @@ -272,7 +272,7 @@ int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) { ++core_->perf_stats_.loads; ++pending_loads_; } - + ++count; } } @@ -282,7 +282,7 @@ int LsuUnit::send_requests(instr_trace_t* 
trace, int block_idx, int tag) { /////////////////////////////////////////////////////////////////////////////// -TcuUnit::TcuUnit(const SimContext& ctx, Core* core) +TcuUnit::TcuUnit(const SimContext& ctx, Core* core) : FuncUnit(ctx, core, "TCU") {} @@ -290,7 +290,7 @@ void TcuUnit::tick() { for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { auto& input = Inputs.at(i); - if (input.empty()) + if (input.empty()) continue; auto& output = Outputs.at(i); auto trace = input.front(); @@ -307,7 +307,7 @@ void TcuUnit::tick() { } default: std::abort(); - } + } DT(3, "pipeline-execute: op=" << trace->tcu_type << ", " << *trace); input.pop(); } diff --git a/sim/simx/local_mem.cpp b/sim/simx/local_mem.cpp index 1bab3fccb..99654aecc 100644 --- a/sim/simx/local_mem.cpp +++ b/sim/simx/local_mem.cpp @@ -24,9 +24,8 @@ class LocalMem::Impl { LocalMem* simobject_; Config config_; RAM ram_; - int32_t bank_sel_addr_start_; - int32_t bank_sel_addr_end_; - PerfStats perf_stats_; + MemCrossBar::Ptr mem_xbar_; + mutable PerfStats perf_stats_; uint64_t to_local_addr(uint64_t addr) { uint32_t total_lines = config_.capacity / config_.line_size; @@ -40,9 +39,15 @@ class LocalMem::Impl { : simobject_(simobject) , config_(config) , ram_(config.capacity) - , bank_sel_addr_start_(0) - , bank_sel_addr_end_(config.B-1) - {} + { + char sname[100]; + snprintf(sname, 100, "%s-xbar", simobject->name().c_str()); + mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_reqs, (1 << config.B)); + for (uint32_t i = 0; i < config.num_reqs; ++i) { + simobject->Inputs.at(i).bind(&mem_xbar_->ReqIn.at(i)); + mem_xbar_->RspIn.at(i).bind(&simobject->Outputs.at(i)); + } + } virtual ~Impl() {} @@ -63,45 +68,33 @@ class LocalMem::Impl { } void tick() { - std::vector in_used_banks(1 << config_.B); - for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) { - auto& core_req_port = simobject_->Inputs.at(req_id); - if (core_req_port.empty()) + // process bank requets from xbar + uint32_t num_banks = 
(1 << config_.B); + for (uint32_t i = 0; i < num_banks; ++i) { + auto& xbar_req_out = mem_xbar_->ReqOut.at(i); + if (xbar_req_out.empty()) continue; - auto& core_req = core_req_port.front(); - - uint32_t bank_id = 0; - if (bank_sel_addr_end_ >= bank_sel_addr_start_) { - bank_id = (uint32_t)bit_getw(core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_); - } - - // bank conflict check - if (in_used_banks.at(bank_id)) { - ++perf_stats_.bank_stalls; - continue; - } - - DT(4, simobject_->name() << " mem-req" << req_id << ": "<< core_req); - - in_used_banks.at(bank_id) = true; + auto& bank_req = xbar_req_out.front(); + DT(4, simobject_->name() << "-bank" << i << "-req : " << bank_req); - if (!core_req.write || config_.write_reponse) { - // send response - MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid}; - simobject_->Outputs.at(req_id).push(core_rsp, 1); + if (!bank_req.write || config_.write_reponse) { + // send xbar response + MemRsp bank_rsp{bank_req.tag, bank_req.cid, bank_req.uuid}; + mem_xbar_->RspOut.at(i).push(bank_rsp, 1); } // update perf counters - perf_stats_.reads += !core_req.write; - perf_stats_.writes += core_req.write; + perf_stats_.reads += !bank_req.write; + perf_stats_.writes += bank_req.write; // remove input - core_req_port.pop(); + xbar_req_out.pop(); } } const PerfStats& perf_stats() const { + perf_stats_.bank_stalls = mem_xbar_->collisions(); return perf_stats_; } }; diff --git a/sim/simx/mem_coalescer.cpp b/sim/simx/mem_coalescer.cpp index 8af567985..073fb5aeb 100644 --- a/sim/simx/mem_coalescer.cpp +++ b/sim/simx/mem_coalescer.cpp @@ -42,10 +42,10 @@ void MemCoalescer::reset() { } void MemCoalescer::tick() { - // process incoming responses + // process outgoing responses if (!RspOut.empty()) { auto& out_rsp = RspOut.front(); - DT(4, this->name() << " mem-rsp: " << out_rsp); + DT(4, this->name() << "-mem-rsp: " << out_rsp); auto& entry = pending_rd_reqs_.at(out_rsp.tag); BitVector<> rsp_mask(input_size_); @@ -89,7 +89,7 @@ void 
MemCoalescer::tick() { // ensure we can allocate a response tag if (pending_rd_reqs_.full()) { - DT(4, "*** " << this->name() << " queue-full: " << in_req); + DT(4, "*** " << this->name() << "-queue-full: " << in_req); return; } @@ -145,7 +145,7 @@ void MemCoalescer::tick() { // send memory request ReqOut.push(out_req, delay_); - DT(4, this->name() << " mem-req: coalesced=" << cur_mask.count() << ", " << out_req); + DT(4, this->name() << "-mem-req: coalesced=" << cur_mask.count() << ", " << out_req); // update sent mask sent_mask_ |= cur_mask; diff --git a/sim/simx/mem_sim.cpp b/sim/simx/mem_sim.cpp index 37ea3bb88..7cfcb3945 100644 --- a/sim/simx/mem_sim.cpp +++ b/sim/simx/mem_sim.cpp @@ -27,13 +27,14 @@ class MemSim::Impl { private: MemSim* simobject_; Config config_; + MemCrossBar::Ptr mem_xbar_; DramSim dram_sim_; PerfStats perf_stats_; struct DramCallbackArgs { - MemSim* simobject; - MemReq request; - uint32_t i; + MemSim::Impl* memsim; + MemReq request; + uint32_t bank_id; }; public: @@ -41,7 +42,15 @@ class MemSim::Impl { : simobject_(simobject) , config_(config) , dram_sim_(MEM_CLOCK_RATIO) - {} + { + char sname[100]; + snprintf(sname, 100, "%s-xbar", simobject->name().c_str()); + mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_ports, config.num_banks); + for (uint32_t i = 0; i < config.num_ports; ++i) { + simobject->MemReqPorts.at(i).bind(&mem_xbar_->ReqIn.at(i)); + mem_xbar_->RspIn.at(i).bind(&simobject->MemRspPorts.at(i)); + } + } ~Impl() { //-- @@ -59,14 +68,14 @@ class MemSim::Impl { dram_sim_.tick(); uint32_t counter = 0; - for (uint32_t i = 0; i < NUM_MEM_PORTS; ++i) { - if (simobject_->MemReqPorts.at(i).empty()) + for (uint32_t i = 0; i < config_.num_banks; ++i) { + if (mem_xbar_->ReqOut.at(i).empty()) continue; - auto& mem_req = simobject_->MemReqPorts.at(i).front(); + auto& mem_req = mem_xbar_->ReqOut.at(i).front(); // try to enqueue the request to the memory system - auto req_args = new DramCallbackArgs{simobject_, 
mem_req, i}; + auto req_args = new DramCallbackArgs{this, mem_req, i}; auto enqueue_success = dram_sim_.send_request( mem_req.write, mem_req.addr, @@ -76,8 +85,8 @@ class MemSim::Impl { // only send a response for read requests if (!rsp_args->request.write) { MemRsp mem_rsp{rsp_args->request.tag, rsp_args->request.cid, rsp_args->request.uuid}; - rsp_args->simobject->MemRspPorts.at(rsp_args->i).push(mem_rsp, 1); - DT(3, rsp_args->simobject->name() << " mem-rsp: bank=" << rsp_args->i << ", " << mem_rsp); + rsp_args->memsim->mem_xbar_->RspOut.at(rsp_args->bank_id).push(mem_rsp, 1); + DT(3, rsp_args->memsim->simobject_->name() << "-mem-rsp: bank=" << rsp_args->bank_id << ", " << mem_rsp); } delete rsp_args; }, @@ -90,9 +99,9 @@ class MemSim::Impl { continue; } - DT(3, simobject_->name() << " mem-req: bank=" << i << ", " << mem_req); + DT(3, simobject_->name() << "-mem-req: bank=" << i << ", " << mem_req); - simobject_->MemReqPorts.at(i).pop(); + mem_xbar_->ReqOut.at(i).pop(); counter++; } @@ -107,8 +116,8 @@ class MemSim::Impl { MemSim::MemSim(const SimContext& ctx, const char* name, const Config& config) : SimObject(ctx, name) - , MemReqPorts(NUM_MEM_PORTS, this) - , MemRspPorts(NUM_MEM_PORTS, this) + , MemReqPorts(config.num_ports, this) + , MemRspPorts(config.num_ports, this) , impl_(new Impl(this, config)) {} diff --git a/sim/simx/mem_sim.h b/sim/simx/mem_sim.h index 2f4f96187..220d1eb4f 100644 --- a/sim/simx/mem_sim.h +++ b/sim/simx/mem_sim.h @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -21,15 +21,15 @@ namespace vortex { class MemSim : public SimObject{ public: struct Config { - uint32_t channels; - uint32_t num_cores; + uint32_t num_banks; + uint32_t num_ports; }; struct PerfStats { uint64_t counter; uint64_t ticks; - PerfStats() + PerfStats() : counter(0) , ticks(0) {} @@ -52,7 +52,7 @@ class MemSim : public SimObject{ void tick(); const PerfStats& perf_stats() const; - + private: class Impl; Impl* impl_; diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp index 20caf2b49..3a54e463c 100644 --- a/sim/simx/processor.cpp +++ b/sim/simx/processor.cpp @@ -24,10 +24,15 @@ ProcessorImpl::ProcessorImpl(const Arch& arch) // create memory simulator memsim_ = MemSim::Create("dram", MemSim::Config{ - MEMORY_BANKS, - uint32_t(arch.num_cores()) * arch.num_clusters() + PLATFORM_MEMORY_BANKS, + L3_MEM_PORTS }); + // create clusters + for (uint32_t i = 0; i < arch.num_clusters(); ++i) { + clusters_.at(i) = Cluster::Create(i, this, arch, dcrs_); + } + // create L3 cache l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{ !L3_ENABLED, @@ -38,7 +43,8 @@ ProcessorImpl::ProcessorImpl(const Arch& arch) log2ceil(L3_NUM_BANKS), // B XLEN, // address bits 1, // number of ports - uint8_t(arch.num_clusters()), // request size + L3_NUM_REQS, // request size + L3_MEM_PORTS, // memory ports L3_WRITEBACK, // write-back false, // write response L3_MSHR_SIZE, // mshr size @@ -46,26 +52,26 @@ ProcessorImpl::ProcessorImpl(const Arch& arch) } ); - // connect L3 memory ports - for (uint32_t i = 0; i < NUM_MEM_PORTS; ++i) { - l3cache_->MemReqPorts.at(i).bind(&memsim_->MemReqPorts.at(i)); - memsim_->MemRspPorts.at(i).bind(&l3cache_->MemRspPorts.at(i)); + // connect L3 core interfaces + for (uint32_t i = 0; i < arch.num_clusters(); ++i) { + for (uint32_t j = 0; j < L2_MEM_PORTS; ++j) { + clusters_.at(i)->mem_req_ports.at(j).bind(&l3cache_->CoreReqPorts.at(i * L2_MEM_PORTS + j)); + l3cache_->CoreRspPorts.at(i * L2_MEM_PORTS + 
j).bind(&clusters_.at(i)->mem_rsp_ports.at(j)); + } } - // create clusters - for (uint32_t i = 0; i < arch.num_clusters(); ++i) { - clusters_.at(i) = Cluster::Create(i, this, arch, dcrs_); - // connect L3 core ports - clusters_.at(i)->mem_req_port.bind(&l3cache_->CoreReqPorts.at(i)); - l3cache_->CoreRspPorts.at(i).bind(&clusters_.at(i)->mem_rsp_port); + // connect L3 memory interfaces + for (uint32_t i = 0; i < L3_MEM_PORTS; ++i) { + l3cache_->MemReqPorts.at(i).bind(&memsim_->MemReqPorts.at(i)); + memsim_->MemRspPorts.at(i).bind(&l3cache_->MemRspPorts.at(i)); } // set up memory profiling - for (uint32_t i = 0; i < NUM_MEM_PORTS; ++i) { + for (uint32_t i = 0; i < L3_MEM_PORTS; ++i) { memsim_->MemReqPorts.at(i).tx_callback([&](const MemReq& req, uint64_t cycle){ __unused (cycle); - perf_mem_reads_ += !req.write; - perf_mem_writes_ += req.write; + perf_mem_reads_ += !req.write; + perf_mem_writes_ += req.write; perf_mem_pending_reads_ += !req.write; }); memsim_->MemRspPorts.at(i).tx_callback([&](const MemRsp&, uint64_t cycle){ diff --git a/sim/simx/socket.cpp b/sim/simx/socket.cpp index cef8a3908..0e70e4ce2 100644 --- a/sim/simx/socket.cpp +++ b/sim/simx/socket.cpp @@ -21,11 +21,9 @@ Socket::Socket(const SimContext& ctx, Cluster* cluster, const Arch &arch, const DCRS &dcrs) - : SimObject(ctx, "socket") - , icache_mem_req_port(this) - , icache_mem_rsp_port(this) - , dcache_mem_req_port(this) - , dcache_mem_rsp_port(this) + : SimObject(ctx, StrFormat("socket%d", socket_id)) + , mem_req_ports(L1_MEM_PORTS, this) + , mem_rsp_ports(L1_MEM_PORTS, this) , socket_id_(socket_id) , cluster_(cluster) , cores_(arch.socket_size()) @@ -33,8 +31,8 @@ Socket::Socket(const SimContext& ctx, auto cores_per_socket = cores_.size(); char sname[100]; - snprintf(sname, 100, "socket%d-icaches", socket_id); - icaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_ICACHES, 1, CacheSim::Config{ + snprintf(sname, 100, "%s-icaches", this->name().c_str()); + icaches_ = 
CacheCluster::Create(sname, cores_per_socket, NUM_ICACHES, CacheSim::Config{ !ICACHE_ENABLED, log2ceil(ICACHE_SIZE), // C log2ceil(L1_LINE_SIZE), // L @@ -44,17 +42,15 @@ Socket::Socket(const SimContext& ctx, XLEN, // address bits 1, // number of ports 1, // number of inputs + 1, // memory ports false, // write-back false, // write response (uint8_t)arch.num_warps(), // mshr size 2, // pipeline latency }); - icaches_->MemReqPort.bind(&icache_mem_req_port); - icache_mem_rsp_port.bind(&icaches_->MemRspPort); - - snprintf(sname, 100, "socket%d-dcaches", socket_id); - dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, DCACHE_NUM_REQS, CacheSim::Config{ + snprintf(sname, 100, "%s-dcaches", this->name().c_str()); + dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, CacheSim::Config{ !DCACHE_ENABLED, log2ceil(DCACHE_SIZE), // C log2ceil(L1_LINE_SIZE), // L @@ -64,21 +60,41 @@ Socket::Socket(const SimContext& ctx, XLEN, // address bits 1, // number of ports DCACHE_NUM_REQS, // number of inputs + L1_MEM_PORTS, // memory ports DCACHE_WRITEBACK, // write-back false, // write response DCACHE_MSHR_SIZE, // mshr size 2, // pipeline latency }); - dcaches_->MemReqPort.bind(&dcache_mem_req_port); - dcache_mem_rsp_port.bind(&dcaches_->MemRspPort); + // connect l1 caches to outgoing memory interfaces + for (uint32_t i = 0; i < L1_MEM_PORTS; ++i) { + if (i == 0) { + snprintf(sname, 100, "%s-l1_arb%d", this->name().c_str(), i); + auto l1_arb = MemArbiter::Create(sname, ArbiterType::RoundRobin, 2, 1); - // create cores + icaches_->MemReqPorts.at(0).bind(&l1_arb->ReqIn.at(1)); + l1_arb->RspIn.at(1).bind(&icaches_->MemRspPorts.at(0)); + + dcaches_->MemReqPorts.at(0).bind(&l1_arb->ReqIn.at(0)); + l1_arb->RspIn.at(0).bind(&dcaches_->MemRspPorts.at(0)); + l1_arb->ReqOut.at(0).bind(&this->mem_req_ports.at(0)); + this->mem_rsp_ports.at(0).bind(&l1_arb->RspOut.at(0)); + } else { + dcaches_->MemReqPorts.at(i).bind(&this->mem_req_ports.at(i)); + 
this->mem_rsp_ports.at(i).bind(&dcaches_->MemRspPorts.at(i)); + } + } + + // create cores for (uint32_t i = 0; i < cores_per_socket; ++i) { uint32_t core_id = socket_id * cores_per_socket + i; cores_.at(i) = Core::Create(core_id, this, arch, dcrs); + } + // connect cores to caches + for (uint32_t i = 0; i < cores_per_socket; ++i) { cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0)); icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0)); diff --git a/sim/simx/socket.h b/sim/simx/socket.h index 104d53292..f8c266d05 100644 --- a/sim/simx/socket.h +++ b/sim/simx/socket.h @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -32,16 +32,13 @@ class Socket : public SimObject { CacheSim::PerfStats dcache; }; - SimPort icache_mem_req_port; - SimPort icache_mem_rsp_port; + std::vector> mem_req_ports; + std::vector> mem_rsp_ports; - SimPort dcache_mem_req_port; - SimPort dcache_mem_rsp_port; - - Socket(const SimContext& ctx, + Socket(const SimContext& ctx, uint32_t socket_id, - Cluster* cluster, - const Arch &arch, + Cluster* cluster, + const Arch &arch, const DCRS &dcrs); ~Socket(); @@ -66,14 +63,14 @@ class Socket : public SimObject { bool running() const; - int get_exitcode() const; + int get_exitcode() const; void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id); void resume(uint32_t core_id); PerfStats perf_stats() const; - + private: uint32_t socket_id_; Cluster* cluster_; diff --git a/sim/simx/types.cpp b/sim/simx/types.cpp index 3e6c5960f..56bf60cea 100644 --- a/sim/simx/types.cpp +++ b/sim/simx/types.cpp @@ -15,11 +15,11 @@ using namespace vortex; -LocalMemDemux::LocalMemDemux( +LocalMemSwitch::LocalMemSwitch( const SimContext& ctx, const char* name, uint32_t delay -) : SimObject(ctx, name) +) : SimObject(ctx, name) , ReqIn(this) , RspIn(this) , ReqLmem(this) @@ -29,19 +29,19 @@ LocalMemDemux::LocalMemDemux( , delay_(delay) {} -void LocalMemDemux::reset() {} +void LocalMemSwitch::reset() {} -void LocalMemDemux::tick() { - // process incoming responses +void LocalMemSwitch::tick() { + // process outgoing responses if (!RspLmem.empty()) { auto& out_rsp = RspLmem.front(); - DT(4, this->name() << " lmem-rsp: " << out_rsp); + DT(4, this->name() << "-lmem-rsp: " << out_rsp); RspIn.push(out_rsp, 1); RspLmem.pop(); } if (!RspDC.empty()) { auto& out_rsp = RspDC.front(); - DT(4, this->name() << " dc-rsp: " << out_rsp); + DT(4, this->name() << "-dc-rsp: " << out_rsp); RspIn.push(out_rsp, 1); RspDC.pop(); } @@ -73,12 +73,12 @@ void LocalMemDemux::tick() { if (!out_dc_req.mask.none()) { ReqDC.push(out_dc_req, delay_); - DT(4, this->name() << " dc-req: " << out_dc_req); + DT(4, 
this->name() << "-dc-req: " << out_dc_req); } if (!out_lmem_req.mask.none()) { ReqLmem.push(out_lmem_req, delay_); - DT(4, this->name() << " lmem-req: " << out_lmem_req); + DT(4, this->name() << "-lmem-req: " << out_lmem_req); } ReqIn.pop(); } @@ -104,12 +104,12 @@ void LsuMemAdapter::reset() {} void LsuMemAdapter::tick() { uint32_t input_size = ReqOut.size(); - // process incoming responses + // process outgoing responses for (uint32_t i = 0; i < input_size; ++i) { if (RspOut.at(i).empty()) continue; auto& out_rsp = RspOut.at(i).front(); - DT(4, this->name() << " rsp" << i << ": " << out_rsp); + DT(4, this->name() << "-rsp" << i << ": " << out_rsp); // build memory response LsuRsp in_rsp(input_size); @@ -141,7 +141,6 @@ void LsuMemAdapter::tick() { if (!ReqIn.empty()) { auto& in_req = ReqIn.front(); assert(in_req.mask.size() == input_size); - for (uint32_t i = 0; i < input_size; ++i) { if (in_req.mask.test(i)) { // build memory request @@ -152,10 +151,9 @@ void LsuMemAdapter::tick() { out_req.tag = in_req.tag; out_req.cid = in_req.cid; out_req.uuid = in_req.uuid; - // send memory request ReqOut.at(i).push(out_req, delay_); - DT(4, this->name() << " req" << i << ": " << out_req); + DT(4, this->name() << "-req" << i << ": " << out_req); } } ReqIn.pop(); diff --git a/sim/simx/types.h b/sim/simx/types.h index 77b351150..76232bbe4 100644 --- a/sim/simx/types.h +++ b/sim/simx/types.h @@ -466,29 +466,29 @@ class HashTable { /////////////////////////////////////////////////////////////////////////////// template -class Mux : public SimObject> { +class Arbiter : public SimObject> { public: std::vector> Inputs; std::vector> Outputs; - Mux( + Arbiter( const SimContext& ctx, const char* name, ArbiterType type, uint32_t num_inputs, uint32_t num_outputs = 1, uint32_t delay = 1 - ) : SimObject>(ctx, name) + ) : SimObject>(ctx, name) , Inputs(num_inputs, this) , Outputs(num_outputs, this) , type_(type) , delay_(delay) - , cursors_(num_outputs, 0) - , num_reqs_(log2ceil(num_inputs 
/ num_outputs)) + , grants_(num_outputs, 0) + , lg2_num_reqs_(log2ceil(num_inputs / num_outputs)) { assert(delay != 0); - assert(num_inputs <= 32); - assert(num_outputs <= 32); + assert(num_inputs <= 64); + assert(num_outputs <= 64); assert(num_inputs >= num_outputs); // bypass mode @@ -500,15 +500,15 @@ class Mux : public SimObject> { } void reset() { - for (auto& cursor : cursors_) { - cursor = 0; + for (auto& grant : grants_) { + grant = 0; } } void tick() { uint32_t I = Inputs.size(); uint32_t O = Outputs.size(); - uint32_t R = 1 << num_reqs_; + uint32_t R = 1 << lg2_num_reqs_; // skip bypass mode if (I == O) @@ -517,8 +517,8 @@ class Mux : public SimObject> { // process inputs for (uint32_t o = 0; o < O; ++o) { for (uint32_t r = 0; r < R; ++r) { - uint32_t i = (cursors_.at(o) + r) & (R-1); - uint32_t j = o * R + i; + uint32_t g = (grants_.at(o) + r) & (R-1); + uint32_t j = o * R + g; if (j >= I) continue; @@ -527,31 +527,134 @@ class Mux : public SimObject> { auto& req = req_in.front(); Outputs.at(o).push(req, delay_); req_in.pop(); - this->update_cursor(o, i); + this->update_grant(o, g); break; } } } } -private: +protected: + + void update_grant(uint32_t index, uint32_t grant) { + if (type_ == ArbiterType::RoundRobin) { + grants_.at(index) = grant + 1; + } + } + + ArbiterType type_; + uint32_t delay_; + std::vector grants_; + uint32_t lg2_num_reqs_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class CrossBar : public SimObject> { +public: + std::vector> Inputs; + std::vector> Outputs; + + CrossBar( + const SimContext& ctx, + const char* name, + ArbiterType type, + uint32_t num_inputs, + uint32_t num_outputs = 1, + uint32_t addr_start = 0, + uint32_t delay = 1 + ) + : SimObject>(ctx, name) + , Inputs(num_inputs, this) + , Outputs(num_outputs, this) + , type_(type) + , delay_(delay) + , grants_(num_outputs, 0) + , lg2_inputs_(log2ceil(num_inputs)) + , lg2_outputs_(log2ceil(num_outputs)) + , 
addr_start_(addr_start) + , collisions_(0) { + assert(delay != 0); + assert(num_inputs <= 64); + assert(num_outputs <= 64); + assert(ispow2(num_outputs)); + } + + void reset() { + for (auto& grant : grants_) { + grant = 0; + } + } + + void tick() { + uint32_t I = Inputs.size(); + uint32_t O = Outputs.size(); + uint32_t R = 1 << lg2_inputs_; + + // process incoming requests + for (uint32_t o = 0; o < O; ++o) { + int32_t input_idx = -1; + for (uint32_t r = 0; r < R; ++r) { + uint32_t i = (grants_.at(o) + r) & (R-1); + if (i >= I) + continue; + auto& req_in = Inputs.at(i); + if (!req_in.empty()) { + auto& req = req_in.front(); + // skip if input is not going to current output + uint32_t output_idx = 0; + if (O != 1) { + output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1); + } + if (output_idx != o) + continue; + if (input_idx != -1) { + ++collisions_; + continue; + } + input_idx = i; + } + } + if (input_idx != -1) { + auto& req_in = Inputs.at(input_idx); + auto& req = req_in.front(); + if (lg2_inputs_ != 0) { + req.tag = (req.tag << lg2_inputs_) | input_idx; + } + DT(4, this->name() << "-req" << input_idx << ": " << req); + Outputs.at(o).push(req, delay_); + req_in.pop(); + this->update_grant(o, input_idx); + } + } + } - void update_cursor(uint32_t index, uint32_t grant) { + uint64_t collisions() const { + return collisions_; + } + +protected: + + void update_grant(uint32_t index, uint32_t grant) { if (type_ == ArbiterType::RoundRobin) { - cursors_.at(index) = grant + 1; + grants_.at(index) = grant + 1; } } ArbiterType type_; uint32_t delay_; - std::vector cursors_; - uint32_t num_reqs_; + std::vector grants_; + uint32_t lg2_inputs_; + uint32_t lg2_outputs_; + uint32_t addr_start_; + uint64_t collisions_; }; /////////////////////////////////////////////////////////////////////////////// template -class Switch : public SimObject> { +class TxArbiter : public SimObject> { public: std::vector> ReqIn; std::vector> RspIn; @@ -559,7 +662,7 @@ class Switch 
: public SimObject> { std::vector> ReqOut; std::vector> RspOut; - Switch( + TxArbiter( const SimContext& ctx, const char* name, ArbiterType type, @@ -567,19 +670,19 @@ class Switch : public SimObject> { uint32_t num_outputs = 1, uint32_t delay = 1 ) - : SimObject>(ctx, name) + : SimObject>(ctx, name) , ReqIn(num_inputs, this) , RspIn(num_inputs, this) , ReqOut(num_outputs, this) , RspOut(num_outputs, this) , type_(type) , delay_(delay) - , cursors_(num_outputs, 0) - , lg_num_reqs_(log2ceil(num_inputs / num_outputs)) + , grants_(num_outputs, 0) + , lg2_num_reqs_(log2ceil(num_inputs / num_outputs)) { assert(delay != 0); - assert(num_inputs <= 32); - assert(num_outputs <= 32); + assert(num_inputs <= 64); + assert(num_outputs <= 64); assert(num_inputs >= num_outputs); // bypass mode @@ -592,76 +695,238 @@ class Switch : public SimObject> { } void reset() { - for (auto& cursor : cursors_) { - cursor = 0; + for (auto& grant : grants_) { + grant = 0; } } void tick() { uint32_t I = ReqIn.size(); uint32_t O = ReqOut.size(); - uint32_t R = 1 << lg_num_reqs_; + uint32_t R = 1 << lg2_num_reqs_; // skip bypass mode if (I == O) return; + // process outgoing responses for (uint32_t o = 0; o < O; ++o) { - // process incoming responses - if (!RspOut.at(o).empty()) { - auto& rsp = RspOut.at(o).front(); - uint32_t i = 0; - if (lg_num_reqs_ != 0) { - i = rsp.tag & (R-1); - rsp.tag >>= lg_num_reqs_; + auto& rsp_out = RspOut.at(o); + if (!rsp_out.empty()) { + auto& rsp = rsp_out.front(); + uint32_t g = 0; + if (lg2_num_reqs_ != 0) { + g = rsp.tag & (R-1); + rsp.tag >>= lg2_num_reqs_; } - DT(4, this->name() << " rsp" << o << ": " << rsp); - uint32_t j = o * R + i; + DT(4, this->name() << "-rsp" << o << ": " << rsp); + uint32_t j = o * R + g; RspIn.at(j).push(rsp, 1); - RspOut.at(o).pop(); + rsp_out.pop(); } + } - // process incoming requests + // process incoming requests + for (uint32_t o = 0; o < O; ++o) { for (uint32_t r = 0; r < R; ++r) { - uint32_t i = (cursors_.at(o) + r) & (R-1); 
- uint32_t j = o * R + i; + uint32_t g = (grants_.at(o) + r) & (R-1); + uint32_t j = o * R + g; if (j >= I) continue; auto& req_in = ReqIn.at(j); if (!req_in.empty()) { auto& req = req_in.front(); - if (lg_num_reqs_ != 0) { - req.tag = (req.tag << lg_num_reqs_) | i; + if (lg2_num_reqs_ != 0) { + req.tag = (req.tag << lg2_num_reqs_) | g; } - DT(4, this->name() << " req" << j << ": " << req); + DT(4, this->name() << "-req" << j << ": " << req); ReqOut.at(o).push(req, delay_); req_in.pop(); - this->update_cursor(o, i); + this->update_grant(o, g); break; } } } } - void update_cursor(uint32_t index, uint32_t grant) { +protected: + + void update_grant(uint32_t index, uint32_t grant) { if (type_ == ArbiterType::RoundRobin) { - cursors_.at(index) = grant + 1; + grants_.at(index) = grant + 1; } } -private: ArbiterType type_; uint32_t delay_; - std::vector cursors_; - uint32_t lg_num_reqs_; + std::vector grants_; + uint32_t lg2_num_reqs_; }; -using MemSwitch = Switch; +/////////////////////////////////////////////////////////////////////////////// + +template +class TxCrossBar : public SimObject> { +public: + std::vector> ReqIn; + std::vector> RspIn; + + std::vector> ReqOut; + std::vector> RspOut; + + TxCrossBar( + const SimContext& ctx, + const char* name, + ArbiterType type, + uint32_t num_inputs, + uint32_t num_outputs = 1, + uint32_t addr_start = 0, + uint32_t delay = 1 + ) + : SimObject>(ctx, name) + , ReqIn(num_inputs, this) + , RspIn(num_inputs, this) + , ReqOut(num_outputs, this) + , RspOut(num_outputs, this) + , type_(type) + , delay_(delay) + , req_grants_(num_outputs, 0) + , rsp_grants_(num_inputs, 0) + , lg2_inputs_(log2ceil(num_inputs)) + , lg2_outputs_(log2ceil(num_outputs)) + , addr_start_(addr_start) + , collisions_(0) { + assert(delay != 0); + assert(num_inputs <= 64); + assert(num_outputs <= 64); + assert(ispow2(num_inputs)); + assert(ispow2(num_outputs)); + } + + void reset() { + for (auto& grant : req_grants_) { + grant = 0; + } + for (auto& grant : 
rsp_grants_) { + grant = 0; + } + } + + void tick() { + uint32_t I = ReqIn.size(); + uint32_t O = ReqOut.size(); + uint32_t R = 1 << lg2_inputs_; + uint32_t T = 1 << lg2_outputs_; + + // process outgoing responses + for (uint32_t i = 0; i < I; ++i) { + int32_t output_idx = -1; + for (uint32_t t = 0; t < T; ++t) { + uint32_t o = (rsp_grants_.at(i) + t) & (T-1); + if (o >= O) + continue; + auto& rsp_out = RspOut.at(o); + if (!rsp_out.empty()) { + auto& rsp = rsp_out.front(); + // skip if response is not going to current input + uint32_t input_idx = 0; + if (lg2_inputs_ != 0) { + input_idx = rsp.tag & (R-1); + } + if (input_idx != i) + continue; + if (output_idx != -1) { + ++collisions_; + continue; + } + output_idx = o; + } + } + if (output_idx != -1) { + auto& rsp_out = RspOut.at(output_idx); + auto& rsp = rsp_out.front(); + uint32_t input_idx = 0; + if (lg2_inputs_ != 0) { + input_idx = rsp.tag & (R-1); + rsp.tag >>= lg2_inputs_; + } + DT(4, this->name() << "-rsp" << output_idx << ": " << rsp); + RspIn.at(input_idx).push(rsp, 1); + rsp_out.pop(); + this->update_rsp_grant(i, output_idx); + } + } + + // process incoming requests + for (uint32_t o = 0; o < O; ++o) { + int32_t input_idx = -1; + for (uint32_t r = 0; r < R; ++r) { + uint32_t i = (req_grants_.at(o) + r) & (R-1); + if (i >= I) + continue; + auto& req_in = ReqIn.at(i); + if (!req_in.empty()) { + auto& req = req_in.front(); + // skip if request is not going to current output + uint32_t output_idx = 0; + if (O != 1) { + output_idx = (uint32_t)bit_getw(req.addr, addr_start_, lg2_outputs_-1); + } + if (output_idx != o) + continue; + if (input_idx != -1) { + ++collisions_; + continue; + } + input_idx = i; + } + } + if (input_idx != -1) { + auto& req_in = ReqIn.at(input_idx); + auto& req = req_in.front(); + if (lg2_inputs_ != 0) { + req.tag = (req.tag << lg2_inputs_) | input_idx; + } + DT(4, this->name() << "-req" << input_idx << ": " << req); + ReqOut.at(o).push(req, delay_); + req_in.pop(); + 
this->update_req_grant(o, input_idx); + } + } + } + + uint64_t collisions() const { + return collisions_; + } + +protected: + + void update_req_grant(uint32_t index, uint32_t grant) { + if (type_ == ArbiterType::RoundRobin) { + req_grants_.at(index) = grant + 1; + } + } + + void update_rsp_grant(uint32_t index, uint32_t grant) { + if (type_ == ArbiterType::RoundRobin) { + rsp_grants_.at(index) = grant + 1; + } + } + + ArbiterType type_; + uint32_t delay_; + std::vector req_grants_; + std::vector rsp_grants_; + uint32_t lg2_inputs_; + uint32_t lg2_outputs_; + uint32_t addr_start_; + uint64_t collisions_; +}; /////////////////////////////////////////////////////////////////////////////// -class LocalMemDemux : public SimObject { +class LocalMemSwitch : public SimObject { public: SimPort ReqIn; SimPort RspIn; @@ -672,7 +937,7 @@ class LocalMemDemux : public SimObject { SimPort ReqDC; SimPort RspDC; - LocalMemDemux( + LocalMemSwitch( const SimContext& ctx, const char* name, uint32_t delay @@ -711,4 +976,7 @@ class LsuMemAdapter : public SimObject { uint32_t delay_; }; +using MemArbiter = TxArbiter; +using MemCrossBar = TxCrossBar; + } diff --git a/sim/xrtsim/xrt_sim.cpp b/sim/xrtsim/xrt_sim.cpp index 8dd800931..b56cf2015 100644 --- a/sim/xrtsim/xrt_sim.cpp +++ b/sim/xrtsim/xrt_sim.cpp @@ -142,8 +142,8 @@ class xrt_sim::Impl { if (future_.valid()) { future_.wait(); } - for (int i = 0; i < PLATFORM_MEMORY_BANKS; ++i) { - delete mem_alloc_[i]; + for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { + delete mem_alloc_[b]; } if (ram_) { delete ram_; @@ -187,8 +187,8 @@ class xrt_sim::Impl { MP_M_AXI_MEM(PLATFORM_MEMORY_BANKS); // initialize memory allocator - for (int i = 0; i < PLATFORM_MEMORY_BANKS; ++i) { - mem_alloc_[i] = new MemoryAllocator(0, mem_bank_size_, 4096, 64); + for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { + mem_alloc_[b] = new MemoryAllocator(0, mem_bank_size_, 4096, 64); } // reset the device @@ -257,8 +257,9 @@ class xrt_sim::Impl { //printf("%0ld: [sim] 
register_write: address=0x%x\n", timestamp, offset); device_->s_axi_ctrl_awvalid = 1; device_->s_axi_ctrl_awaddr = offset; - while (!device_->s_axi_ctrl_awready) + while (!device_->s_axi_ctrl_awready) { this->tick(); + } this->tick(); device_->s_axi_ctrl_awvalid = 0; @@ -267,8 +268,9 @@ class xrt_sim::Impl { device_->s_axi_ctrl_wvalid = 1; device_->s_axi_ctrl_wdata = value; device_->s_axi_ctrl_wstrb = 0xf; - while (!device_->s_axi_ctrl_wready) + while (!device_->s_axi_ctrl_wready) { this->tick(); + } this->tick(); device_->s_axi_ctrl_wvalid = 0; @@ -290,8 +292,9 @@ class xrt_sim::Impl { //printf("%0ld: [sim] register_read: address=0x%x\n", timestamp, offset); device_->s_axi_ctrl_arvalid = 1; device_->s_axi_ctrl_araddr = offset; - while (!device_->s_axi_ctrl_arready) + while (!device_->s_axi_ctrl_arready) { this->tick(); + } this->tick(); device_->s_axi_ctrl_arvalid = 0; @@ -318,9 +321,9 @@ class xrt_sim::Impl { reqs.clear(); } - for (int i = 0; i < PLATFORM_MEMORY_BANKS; ++i) { + for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { std::queue empty; - std::swap(dram_queues_[i], empty); + std::swap(dram_queues_[b], empty); } device_->ap_rst_n = 0; @@ -335,10 +338,10 @@ class xrt_sim::Impl { device_->ap_rst_n = 1; // this AXI device is always ready to accept new requests - for (int i = 0; i < PLATFORM_MEMORY_BANKS; ++i) { - *m_axi_mem_[i].arready = 1; - *m_axi_mem_[i].awready = 1; - *m_axi_mem_[i].wready = 1; + for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { + *m_axi_mem_[b].arready = 1; + *m_axi_mem_[b].awready = 1; + *m_axi_mem_[b].wready = 1; } } @@ -355,10 +358,10 @@ class xrt_sim::Impl { dram_sim_.tick(); - for (int i = 0; i < PLATFORM_MEMORY_BANKS; ++i) { - if (!dram_queues_[i].empty()) { - auto mem_req = dram_queues_[i].front(); - if (dram_sim_.send_request(mem_req->write, mem_req->addr, i, [](void* arg) { + for (int b = 0; b < PLATFORM_MEMORY_BANKS; ++b) { + if (!dram_queues_[b].empty()) { + auto mem_req = dram_queues_[b].front(); + if 
(dram_sim_.send_request(mem_req->write, mem_req->addr, b, [](void* arg) { auto orig_req = reinterpret_cast(arg); if (orig_req->ready) { delete orig_req; @@ -366,7 +369,7 @@ class xrt_sim::Impl { orig_req->ready = true; } }, mem_req)) { - dram_queues_[i].pop(); + dram_queues_[b].pop(); } } }