From 44181831e24b50e5b06ed7dc76639d8511aab1c8 Mon Sep 17 00:00:00 2001 From: Tim Fischer Date: Tue, 23 Jul 2024 14:40:13 +0200 Subject: [PATCH] hw: Support multi-ID DMA (#144) --- .gitlab-ci.yml | 8 + docs/rm/custom_instructions.md | 1 + docs/schema/snitch_cluster.schema.json | 5 + hw/snitch/src/snitch_pkg.sv | 4 +- hw/snitch_cluster/src/snitch_cc.sv | 44 +-- hw/snitch_cluster/src/snitch_cluster.sv | 27 +- .../src/snitch_cluster_wrapper.sv.tpl | 9 +- sw/snRuntime/src/dma.h | 279 +++++++++++++----- sw/snRuntime/src/start.c | 1 + sw/tests/dma_mchan.c | 60 ++++ sw/tests/interrupt.c | 82 ----- target/snitch_cluster/.gitignore | 3 +- target/snitch_cluster/cfg/dma_mchan.hjson | 154 ++++++++++ target/snitch_cluster/sw/dma_mchan.yaml | 6 + target/snitch_cluster/sw/run.yaml | 1 - target/snitch_cluster/sw/tests/Makefile | 2 +- 16 files changed, 483 insertions(+), 203 deletions(-) create mode 100644 sw/tests/dma_mchan.c delete mode 100644 sw/tests/interrupt.c create mode 100644 target/snitch_cluster/cfg/dma_mchan.hjson create mode 100644 target/snitch_cluster/sw/dma_mchan.yaml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 849171454..6e8f45cb1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -138,6 +138,14 @@ snitch-cluster-omega-vsim: - make bin/snitch_cluster.vsim - ./util/run.py sw/run.yaml --simulator vsim -j --run-dir runs/vsim +# Test Multi-channel DMA +snitch-cluster-mchan-vsim: + script: + - cd target/snitch_cluster + - make CFG_OVERRIDE=cfg/dma_mchan.hjson sw + - make bin/snitch_cluster.vsim + - ./util/run.py sw/dma_mchan.yaml --simulator vsim -j --run-dir runs/vsim + ############ # Non-free # ############ diff --git a/docs/rm/custom_instructions.md b/docs/rm/custom_instructions.md index 0e834b33a..913bcc7be 100644 --- a/docs/rm/custom_instructions.md +++ b/docs/rm/custom_instructions.md @@ -90,6 +90,7 @@ DMCPY and DMCPYI initiate an asynchronous data movement with the parameters conf |--------------|-------------|------------- | config[0] | decouple_rw | 
Decouple the handshakes of the read and write channels | config[1] | enable_2d | Enable two-dimensional transfer +| config[4:2] | channel_sel | Selects the DMA backend if a multi-channel DMA is used DMSTAT and DMSTATI place the selected *status* flag of the DMA into register *rd*. The following *status* flags are supported: diff --git a/docs/schema/snitch_cluster.schema.json b/docs/schema/snitch_cluster.schema.json index f84a0d2fd..f9d5831ca 100644 --- a/docs/schema/snitch_cluster.schema.json +++ b/docs/schema/snitch_cluster.schema.json @@ -134,6 +134,11 @@ "description": "Id width of the wide AXI plug into the cluster.", "default": 1 }, + "dma_nr_channels": { + "type": "number", + "description": "The number of separate DMA channels to instantiate.", + "default": 1 + }, "user_width": { "type": "number", "description": "User width of the narrower AXI plug into the cluster.", diff --git a/hw/snitch/src/snitch_pkg.sv b/hw/snitch/src/snitch_pkg.sv index d65b9ead2..e70325b71 100644 --- a/hw/snitch/src/snitch_pkg.sv +++ b/hw/snitch/src/snitch_pkg.sv @@ -143,8 +143,8 @@ package snitch_pkg; } cluster_slave_dma_e; typedef enum int unsigned { - SDMAMst = 32'd0, - SoCDMAIn = 32'd1, + SoCDMAIn = 32'd0, + SDMAMst = 32'd1, ICache = 32'd2 } cluster_master_dma_e; diff --git a/hw/snitch_cluster/src/snitch_cc.sv b/hw/snitch_cluster/src/snitch_cc.sv index 15d8abff5..0040a5654 100644 --- a/hw/snitch_cluster/src/snitch_cc.sv +++ b/hw/snitch_cluster/src/snitch_cc.sv @@ -23,6 +23,7 @@ module snitch_cc #( parameter int unsigned DMAUserWidth = 0, parameter int unsigned DMANumAxInFlight = 0, parameter int unsigned DMAReqFifoDepth = 0, + parameter int unsigned DMANumChannels = 0, /// Data port request type. parameter type dreq_t = logic, /// Data port response type. 
@@ -114,33 +115,33 @@ module snitch_cc #( parameter type addr_t = logic [AddrWidth-1:0], parameter type data_t = logic [DataWidth-1:0] ) ( - input logic clk_i, - input logic clk_d2_i, - input logic rst_ni, - input logic rst_int_ss_ni, - input logic rst_fp_ss_ni, - input logic [31:0] hart_id_i, - input snitch_pkg::interrupts_t irq_i, - output hive_req_t hive_req_o, - input hive_rsp_t hive_rsp_i, + input logic clk_i, + input logic clk_d2_i, + input logic rst_ni, + input logic rst_int_ss_ni, + input logic rst_fp_ss_ni, + input logic [31:0] hart_id_i, + input snitch_pkg::interrupts_t irq_i, + output hive_req_t hive_req_o, + input hive_rsp_t hive_rsp_i, // Core data ports - output dreq_t data_req_o, - input drsp_t data_rsp_i, + output dreq_t data_req_o, + input drsp_t data_rsp_i, // TCDM Streamer Ports - output tcdm_req_t [TCDMPorts-1:0] tcdm_req_o, - input tcdm_rsp_t [TCDMPorts-1:0] tcdm_rsp_i, + output tcdm_req_t [TCDMPorts-1:0] tcdm_req_o, + input tcdm_rsp_t [TCDMPorts-1:0] tcdm_rsp_i, // Accelerator Offload port // DMA ports - output axi_req_t axi_dma_req_o, - input axi_rsp_t axi_dma_res_i, - output logic axi_dma_busy_o, - output dma_events_t axi_dma_events_o, + output axi_req_t [DMANumChannels-1:0] axi_dma_req_o, + input axi_rsp_t [DMANumChannels-1:0] axi_dma_res_i, + output logic [DMANumChannels-1:0] axi_dma_busy_o, + output dma_events_t [DMANumChannels-1:0] axi_dma_events_o, // Core event strobes - output snitch_pkg::core_events_t core_events_o, - input addr_t tcdm_addr_base_i, + output snitch_pkg::core_events_t core_events_o, + input addr_t tcdm_addr_base_i, // Cluster HW barrier - output logic barrier_o, - input logic barrier_i + output logic barrier_o, + input logic barrier_i ); // FMA architecture is "merged" -> mulexp and macexp instructions are supported @@ -385,6 +386,7 @@ module snitch_cc #( .AxiUserWidth (DMAUserWidth), .NumAxInFlight (DMANumAxInFlight), .DMAReqFifoDepth (DMAReqFifoDepth), + .NumChannels (DMANumChannels), .axi_ar_chan_t (axi_ar_chan_t), 
.axi_aw_chan_t (axi_aw_chan_t), .axi_req_t (axi_req_t), diff --git a/hw/snitch_cluster/src/snitch_cluster.sv b/hw/snitch_cluster/src/snitch_cluster.sv index fa657e52a..4e4c4b00c 100644 --- a/hw/snitch_cluster/src/snitch_cluster.sv +++ b/hw/snitch_cluster/src/snitch_cluster.sv @@ -31,7 +31,7 @@ module snitch_cluster parameter int unsigned WideDataWidth = 512, /// AXI: id width in. parameter int unsigned NarrowIdWidthIn = 2, - /// AXI: dma id with in *currently not available* + /// AXI: dma id width in. parameter int unsigned WideIdWidthIn = 2, /// AXI: user width. parameter int unsigned NarrowUserWidth = 1, @@ -57,6 +57,8 @@ module snitch_cluster parameter int unsigned DMANumAxInFlight = 3, /// Size of DMA request fifo. parameter int unsigned DMAReqFifoDepth = 3, + /// Number of DMA channels. + parameter int unsigned DMANumChannels = 1, /// Width of a single icache line. parameter int unsigned ICacheLineWidth [NrHives] = '{default: 0}, /// Number of icache lines per set. @@ -274,8 +276,8 @@ module snitch_cluster localparam int unsigned NrRuleIdcs = NrSlaves - 1; localparam int unsigned NrRules = (1 + AliasRegionEnable) * NrRuleIdcs; - // DMA, SoC Request, `n` instruction caches. - localparam int unsigned NrWideMasters = 2 + NrHives; + // SoC Request, DMA Channels, `n` instruction caches. 
+ localparam int unsigned NrWideMasters = 1 + DMANumChannels + NrHives; localparam int unsigned WideIdWidthOut = $clog2(NrWideMasters) + WideIdWidthIn; // DMA X-BAR configuration localparam int unsigned NrWideSlaves = 3; @@ -837,10 +839,10 @@ module snitch_cluster localparam int unsigned TcdmPorts = get_tcdm_ports(i); localparam int unsigned TcdmPortsOffs = get_tcdm_port_offs(i); - axi_mst_dma_req_t axi_dma_req; - axi_mst_dma_resp_t axi_dma_res; + axi_mst_dma_req_t [DMANumChannels-1:0] axi_dma_req; + axi_mst_dma_resp_t [DMANumChannels-1:0] axi_dma_res; interrupts_t irq; - dma_events_t dma_core_events; + dma_events_t [DMANumChannels-1:0] dma_core_events; sync #(.STAGES (2)) i_sync_debug (.clk_i, .rst_ni, .serial_i (debug_req_i[i]), .serial_o (irq.debug)); @@ -863,6 +865,7 @@ module snitch_cluster .SnitchPMACfg (SnitchPMACfg), .DMANumAxInFlight (DMANumAxInFlight), .DMAReqFifoDepth (DMAReqFifoDepth), + .DMANumChannels (DMANumChannels), .dreq_t (reqrsp_req_t), .drsp_t (reqrsp_rsp_t), .tcdm_req_t (tcdm_req_t), @@ -951,9 +954,11 @@ module snitch_cluster end end if (Xdma[i]) begin : gen_dma_connection - assign wide_axi_mst_req[SDMAMst] = axi_dma_req; - assign axi_dma_res = wide_axi_mst_rsp[SDMAMst]; - assign dma_events = dma_core_events; + for (genvar j = 0; j < DMANumChannels; j++) begin : gen_dma_connection + assign wide_axi_mst_req[SDMAMst + j] = axi_dma_req[j]; + assign axi_dma_res[j] = wide_axi_mst_rsp[SDMAMst + j]; + end + assign dma_events = dma_core_events[0]; // Only first channel is tracked end end @@ -1001,8 +1006,8 @@ module snitch_cluster .hive_rsp_o (hive_rsp_reshape), .ptw_data_req_o (ptw_req[i]), .ptw_data_rsp_i (ptw_rsp[i]), - .axi_req_o (wide_axi_mst_req[ICache+i]), - .axi_rsp_i (wide_axi_mst_rsp[ICache+i]), + .axi_req_o (wide_axi_mst_req[SDMAMst+DMANumChannels+i]), + .axi_rsp_i (wide_axi_mst_rsp[SDMAMst+DMANumChannels+i]), .icache_prefetch_enable_i (icache_prefetch_enable), .icache_events_o(icache_events_reshape), .sram_cfgs_i diff --git 
a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl index c40f50440..b3c35d9ac 100644 --- a/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl +++ b/hw/snitch_cluster/src/snitch_cluster_wrapper.sv.tpl @@ -54,12 +54,12 @@ package ${cfg['pkg_name']}; localparam int unsigned WideDataWidth = ${cfg['dma_data_width']}; localparam int unsigned NarrowIdWidthIn = ${cfg['id_width_in']}; - localparam int unsigned NrMasters = 3; - localparam int unsigned NarrowIdWidthOut = $clog2(NrMasters) + NarrowIdWidthIn; + localparam int unsigned NrNarrowMasters = 3; + localparam int unsigned NarrowIdWidthOut = $clog2(NrNarrowMasters) + NarrowIdWidthIn; - localparam int unsigned NrDmaMasters = 2 + ${cfg['nr_hives']}; + localparam int unsigned NrWideMasters = 1 + ${cfg['dma_nr_channels']} + ${cfg['nr_hives']}; localparam int unsigned WideIdWidthIn = ${cfg['dma_id_width_in']}; - localparam int unsigned WideIdWidthOut = $clog2(NrDmaMasters) + WideIdWidthIn; + localparam int unsigned WideIdWidthOut = $clog2(NrWideMasters) + WideIdWidthIn; localparam int unsigned NarrowUserWidth = ${cfg['user_width']}; localparam int unsigned WideUserWidth = ${cfg['dma_user_width']}; @@ -274,6 +274,7 @@ module ${cfg['name']}_wrapper ( .NrBanks (${cfg['tcdm']['banks']}), .DMANumAxInFlight (${cfg['dma_axi_req_fifo_depth']}), .DMAReqFifoDepth (${cfg['dma_req_fifo_depth']}), + .DMANumChannels (${cfg['dma_nr_channels']}), .ICacheLineWidth (${cfg['pkg_name']}::ICacheLineWidth), .ICacheLineCount (${cfg['pkg_name']}::ICacheLineCount), .ICacheSets (${cfg['pkg_name']}::ICacheSets), diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h index 3e4dc0573..637152a1d 100644 --- a/sw/snRuntime/src/dma.h +++ b/sw/snRuntime/src/dma.h @@ -4,12 +4,26 @@ #pragma once +#define OP_CUSTOM1 0b0101011 +#define XDMA_FUNCT3 0b000 +#define DMSRC_FUNCT7 0b0000000 +#define DMDST_FUNCT7 0b0000001 +#define DMCPYI_FUNCT7 0b0000010 +#define DMCPY_FUNCT7 0b0000011 +#define 
DMSTATI_FUNCT7 0b0000100 +#define DMSTAT_FUNCT7 0b0000101 +#define DMSTR_FUNCT7 0b0000110 +#define DMREP_FUNCT7 0b0000111 + +#define R_TYPE_ENCODE(funct7, rs2, rs1, funct3, rd, opcode) \ + ((funct7 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) | (rd << 7) | \ + (opcode)) + /// A DMA transfer identifier. typedef uint32_t snrt_dma_txid_t; -/// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers. -inline snrt_dma_txid_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src, - size_t size) { +inline uint32_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src, + size_t size) { register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10 register uint32_t reg_dst_high asm("a1") = dst >> 32; // 11 register uint32_t reg_src_low asm("a2") = src >> 0; // 12 @@ -17,34 +31,22 @@ inline snrt_dma_txid_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src, register uint32_t reg_size asm("a4") = size; // 14 // dmsrc a2, a3 - asm volatile( - ".word (0b0000000 << 25) | \ - ( (13) << 20) | \ - ( (12) << 15) | \ - ( 0b000 << 12) | \ - (0b0101011 << 0) \n" ::"r"(reg_src_high), - "r"(reg_src_low)); + asm volatile(".word %0\n" ::"i"(R_TYPE_ENCODE(DMSRC_FUNCT7, 13, 12, + XDMA_FUNCT3, 0, OP_CUSTOM1)), + "r"(reg_src_high), "r"(reg_src_low)); // dmdst a0, a1 - asm volatile( - ".word (0b0000001 << 25) | \ - ( (11) << 20) | \ - ( (10) << 15) | \ - ( 0b000 << 12) | \ - (0b0101011 << 0) \n" ::"r"(reg_dst_high), - "r"(reg_dst_low)); + asm volatile(".word %0\n" ::"i"(R_TYPE_ENCODE(DMDST_FUNCT7, 11, 10, + XDMA_FUNCT3, 0, OP_CUSTOM1)), + "r"(reg_dst_high), "r"(reg_dst_low)); // dmcpyi a0, a4, 0b00 register uint32_t reg_txid asm("a0"); // 10 - asm volatile( - ".word (0b0000010 << 25) | \ - ( 0b00000 << 20) | \ - ( (14) << 15) | \ - ( 0b000 << 12) | \ - ( (10) << 7) | \ - (0b0101011 << 0) \n" - : "=r"(reg_txid) - : "r"(reg_size)); + asm volatile(".word %1\n" + : "=r"(reg_txid) + : "i"(R_TYPE_ENCODE(DMCPYI_FUNCT7, 0b00000, 14, XDMA_FUNCT3, + 10, OP_CUSTOM1)), + "r"(reg_size)); 
return reg_txid; } @@ -70,53 +72,32 @@ inline snrt_dma_txid_t snrt_dma_start_2d_wideptr(uint64_t dst, uint64_t src, register uint32_t reg_repeat asm("a7") = repeat; // 17 // dmsrc a0, a1 - asm volatile( - ".word (0b0000000 << 25) | \ - ( (13) << 20) | \ - ( (12) << 15) | \ - ( 0b000 << 12) | \ - (0b0101011 << 0) \n" ::"r"(reg_src_high), - "r"(reg_src_low)); + asm volatile(".word %0\n" ::"i"(R_TYPE_ENCODE(DMSRC_FUNCT7, 13, 12, + XDMA_FUNCT3, 0, OP_CUSTOM1)), + "r"(reg_src_high), "r"(reg_src_low)); // dmdst a0, a1 - asm volatile( - ".word (0b0000001 << 25) | \ - ( (11) << 20) | \ - ( (10) << 15) | \ - ( 0b000 << 12) | \ - (0b0101011 << 0) \n" ::"r"(reg_dst_high), - "r"(reg_dst_low)); + asm volatile(".word %0\n" ::"i"(R_TYPE_ENCODE(DMDST_FUNCT7, 11, 10, + XDMA_FUNCT3, 0, OP_CUSTOM1)), + "r"(reg_dst_high), "r"(reg_dst_low)); // dmstr a5, a6 - asm volatile( - ".word (0b0000110 << 25) | \ - ( (15) << 20) | \ - ( (16) << 15) | \ - ( 0b000 << 12) | \ - (0b0101011 << 0) \n" - : - : "r"(reg_dst_stride), "r"(reg_src_stride)); + asm volatile(".word %0\n" ::"i"(R_TYPE_ENCODE(DMSTR_FUNCT7, 15, 16, + XDMA_FUNCT3, 0, OP_CUSTOM1)), + "r"(reg_src_stride), "r"(reg_dst_stride)); // dmrep a7 - asm volatile( - ".word (0b0000111 << 25) | \ - ( (17) << 15) | \ - ( 0b000 << 12) | \ - (0b0101011 << 0) \n" - : - : "r"(reg_repeat)); + asm volatile(".word %0\n" ::"i"(R_TYPE_ENCODE(DMREP_FUNCT7, 0, 17, + XDMA_FUNCT3, 0, OP_CUSTOM1)), + "r"(reg_repeat)); // dmcpyi a0, a4, 0b10 register uint32_t reg_txid asm("a0"); // 10 - asm volatile( - ".word (0b0000010 << 25) | \ - ( 0b00010 << 20) | \ - ( (14) << 15) | \ - ( 0b000 << 12) | \ - ( (10) << 7) | \ - (0b0101011 << 0) \n" - : "=r"(reg_txid) - : "r"(reg_size)); + asm volatile(".word %1\n" + : "=r"(reg_txid) + : "i"(R_TYPE_ENCODE(DMCPYI_FUNCT7, 0b00010, 14, XDMA_FUNCT3, + 10, OP_CUSTOM1)), + "r"(reg_size)); return reg_txid; } @@ -129,32 +110,163 @@ inline snrt_dma_txid_t snrt_dma_start_2d(void *dst, const void *src, src_stride, repeat); } +/// 
Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers and a +/// specific channel. +inline snrt_dma_txid_t snrt_dma_start_1d_channel_wideptr(uint64_t dst, + uint64_t src, + size_t size, + uint32_t channel) { + register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10 + register uint32_t reg_dst_high asm("a1") = dst >> 32; // 11 + register uint32_t reg_src_low asm("a2") = src >> 0; // 12 + register uint32_t reg_src_high asm("a3") = src >> 32; // 13 + register uint32_t reg_size asm("a4") = size; // 14 + register uint32_t cfg asm("a5") = channel << 2; // 15 + + // dmsrc a2, a3 + asm volatile(".word %0\n" ::"i"(R_TYPE_ENCODE(DMSRC_FUNCT7, 13, 12, + XDMA_FUNCT3, 0, OP_CUSTOM1)), + "r"(reg_src_high), "r"(reg_src_low)); + + // dmdst a0, a1 + asm volatile(".word %0\n" ::"i"(R_TYPE_ENCODE(DMDST_FUNCT7, 11, 10, + XDMA_FUNCT3, 0, OP_CUSTOM1)), + "r"(reg_dst_high), "r"(reg_dst_low)); + + // dmcpy a0, a4, a5 # a5 = config word, channel in bits [4:2] + register uint32_t reg_txid asm("a0"); // 10 + asm volatile( + ".word %1\n" + : "=r"(reg_txid) + : "i"(R_TYPE_ENCODE(DMCPY_FUNCT7, 15, 14, XDMA_FUNCT3, 10, OP_CUSTOM1)), + "r"(reg_size), "r"(cfg)); + + return reg_txid; +} + +/// Initiate an asynchronous 1D DMA transfer on a specific channel. +inline snrt_dma_txid_t snrt_dma_start_1d_channel(void *dst, const void *src, + size_t size, + uint32_t channel) { + return snrt_dma_start_1d_channel_wideptr((size_t)dst, (size_t)src, size, + channel); +} + +/// Initiate an asynchronous 2D DMA transfer with wide 64-bit pointers on a +/// specific channel. 
+inline snrt_dma_txid_t snrt_dma_start_2d_channel_wideptr( + uint64_t dst, uint64_t src, size_t size, size_t dst_stride, + size_t src_stride, size_t repeat, uint32_t channel) { + register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10 + register uint32_t reg_dst_high asm("a1") = dst >> 32; // 11 + register uint32_t reg_src_low asm("a2") = src >> 0; // 12 + register uint32_t reg_src_high asm("a3") = src >> 32; // 13 + register uint32_t reg_size asm("a4") = size; // 14 + register uint32_t reg_dst_stride asm("a5") = dst_stride; // 15 + register uint32_t reg_src_stride asm("a6") = src_stride; // 16 + register uint32_t reg_repeat asm("a7") = repeat; // 17 + register uint32_t cfg asm("t2") = channel << 2 | 2; // 7 + + // dmsrc a2, a3 + asm volatile(".word %0\n" ::"i"(R_TYPE_ENCODE(DMSRC_FUNCT7, 13, 12, + XDMA_FUNCT3, 0, OP_CUSTOM1)), + "r"(reg_src_high), "r"(reg_src_low)); + + // dmdst a0, a1 + asm volatile(".word %0\n" ::"i"(R_TYPE_ENCODE(DMDST_FUNCT7, 11, 10, + XDMA_FUNCT3, 0, OP_CUSTOM1)), + "r"(reg_dst_high), "r"(reg_dst_low)); + + // dmstr a6, a5 # rs1 = a6 (src_stride), rs2 = a5 (dst_stride) + asm volatile(".word %0\n" ::"i"(R_TYPE_ENCODE(DMSTR_FUNCT7, 15, 16, + XDMA_FUNCT3, 0, OP_CUSTOM1)), + "r"(reg_src_stride), "r"(reg_dst_stride)); + + // dmrep a7 + asm volatile(".word %0\n" ::"i"(R_TYPE_ENCODE(DMREP_FUNCT7, 0, 17, + XDMA_FUNCT3, 0, OP_CUSTOM1)), + "r"(reg_repeat)); + + // dmcpy a0, a4, t2 # t2 = config word: channel << 2 | enable_2d + register uint32_t reg_txid asm("a0"); // 10 + asm volatile( + ".word %1\n" + : "=r"(reg_txid) + : "i"(R_TYPE_ENCODE(DMCPY_FUNCT7, 7, 14, XDMA_FUNCT3, 10, OP_CUSTOM1)), + "r"(cfg), "r"(reg_size)); + + return reg_txid; +} + +/// Initiate an asynchronous 2D DMA transfer on a specific channel. 
+inline snrt_dma_txid_t snrt_dma_start_2d_channel(void *dst, const void *src, + size_t size, size_t dst_stride, + size_t src_stride, + size_t repeat, + uint32_t channel) { + return snrt_dma_start_2d_channel_wideptr((size_t)dst, (size_t)src, size, + dst_stride, src_stride, repeat, + channel); +} + /// Block until a transfer finishes. inline void snrt_dma_wait(snrt_dma_txid_t tid) { - // dmstati t0, 0 # 2=status.completed_id + // dmstati t0, 0 # 0=status.completed_id asm volatile( "1: \n" - ".word (0b0000100 << 25) | \ - ( 0b00000 << 20) | \ - ( 0b000 << 12) | \ - ( (5) << 7) | \ - (0b0101011 << 0) \n" - "bltu t0, %0, 1b \n" ::"r"(tid) + ".word %0\n" + "bltu t0, %1, 1b \n" ::"i"( + R_TYPE_ENCODE(DMSTATI_FUNCT7, 0b00, 0, XDMA_FUNCT3, 5, OP_CUSTOM1)), + "r"(tid) : "t0"); } +/// Block until a transfer finishes on a specific channel. +inline void snrt_dma_wait_channel(snrt_dma_txid_t tid, uint32_t channel) { + // dmstat t0, t1 # t1 = channel << 2 | 0 (0=status.completed_id) + register uint32_t cfg asm("t1") = channel << 2; + asm volatile( + "1: \n" + ".word %0\n" + "sub t0, t0, %1 \n" // t0 = completed_id - tid + "bltz t0, 1b \n" ::"i"( // retry while completed_id < tid + R_TYPE_ENCODE(DMSTAT_FUNCT7, 6, 0, XDMA_FUNCT3, 5, OP_CUSTOM1)), + "r"(tid), "r"(cfg) + : "t0"); +} + /// Block until all operation on the DMA ceases. inline void snrt_dma_wait_all() { // dmstati t0, 2 # 2=status.busy asm volatile( "1: \n" - ".word (0b0000100 << 25) | \ - ( 0b00010 << 20) | \ - ( 0b000 << 12) | \ - ( (5) << 7) | \ - (0b0101011 << 0) \n" - "bne t0, zero, 1b \n" :: - : "t0"); + ".word %0\n" + "bne t0, zero, 1b \n" ::"i"( + R_TYPE_ENCODE(DMSTATI_FUNCT7, 0b10, 0, XDMA_FUNCT3, 5, OP_CUSTOM1)) + : "t0"); +} + +/// Block until all operation on the DMA ceases on a specific channel. 
+inline void snrt_dma_wait_all_channel(uint32_t channel) { + // dmstat t0, t1 # t1 = channel << 2 | 2, i.e. status 2 (busy) of the + // selected channel; spin until it reads back as zero. + register uint32_t cfg asm("t1") = channel << 2 | 2; + asm volatile( + "1: \n" + ".word %0\n" + "bne t0, zero, 1b \n" ::"i"( + R_TYPE_ENCODE(DMSTAT_FUNCT7, 6, 0, XDMA_FUNCT3, 5, OP_CUSTOM1)), + "r"(cfg) + : "t0"); +} + +/// Wait until all channels are idle +inline void snrt_dma_wait_all_channels(uint32_t num_channels) { + // Poll channels 0 .. num_channels-1 one after another; each call returns + // once that channel has been observed idle. + for (uint32_t c = 0; c < num_channels; c++) { + snrt_dma_wait_all_channel(c); + } } /** @@ -163,7 +275,11 @@ inline void snrt_dma_wait_all() { * analyzed * */ -inline void snrt_dma_start_tracking() { asm volatile("dmstati zero, 1"); } +inline void snrt_dma_start_tracking() { + // dmstati zero, 0 + asm volatile(".word %0\n" ::"i"( + R_TYPE_ENCODE(DMSTATI_FUNCT7, 0b00, 0, XDMA_FUNCT3, 0, OP_CUSTOM1))); +} /** * @brief stop tracking of dma performance region. Does not have any @@ -171,7 +287,10 @@ inline void snrt_dma_start_tracking() { asm volatile("dmstati zero, 1"); } * analyzed * */ -inline void snrt_dma_stop_tracking() { asm volatile("dmstati zero, 3"); } +inline void snrt_dma_stop_tracking() { + asm volatile(".word %0\n" ::"i"( + R_TYPE_ENCODE(DMSTATI_FUNCT7, 0b11, 0, XDMA_FUNCT3, 0, OP_CUSTOM1))); +} /** * @brief fast memset function performed by DMA diff --git a/sw/snRuntime/src/start.c b/sw/snRuntime/src/start.c index 7f117c206..c91f4c25c 100644 --- a/sw/snRuntime/src/start.c +++ b/sw/snRuntime/src/start.c @@ -53,6 +53,7 @@ static inline void snrt_init_tls() { snrt_dma_start_1d((void*)(tls_ptr + i * tls_offset), (void*)(snrt_zero_memory_ptr()), size); } + snrt_dma_wait_all(); } snrt_cluster_hw_barrier(); diff --git a/sw/tests/dma_mchan.c b/sw/tests/dma_mchan.c new file mode 100644 index 000000000..14b7f33ee --- /dev/null +++ b/sw/tests/dma_mchan.c @@ -0,0 +1,60 @@ +// Copyright 2024 ETH Zurich and University of Bologna. 
+// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include <snrt.h> + +#define TRANSFER_SIZE 64 +#define TRANSFER_LEN (TRANSFER_SIZE / sizeof(uint32_t)) +#define TRANSFER_REP 64 + +uint32_t buffer_src_l3[TRANSFER_LEN * TRANSFER_REP]; + +int main() { + if (!snrt_is_dm_core()) return 0; // only DMA core + uint32_t errors = 0; + uint32_t buffer_src_l1[TRANSFER_LEN]; + uint32_t buffer_dst_l1_1[TRANSFER_LEN]; + uint32_t buffer_dst_l1_2[TRANSFER_LEN]; + + // Populate buffers. + for (uint32_t i = 0; i < TRANSFER_LEN; i++) { + buffer_src_l1[i] = 0xAAAAAAAA; + buffer_dst_l1_1[i] = 0x55555555; + buffer_dst_l1_2[i] = 0xBBBBBBBB; + } + for (uint32_t i = 0; i < TRANSFER_LEN * TRANSFER_REP; i++) { + buffer_src_l3[i] = i + 1; + } + + // Start slow/large 2D transfer from L3 to L1 on channel 0. + snrt_dma_start_2d_channel(buffer_dst_l1_1, buffer_src_l3, TRANSFER_SIZE, 0, + TRANSFER_SIZE, TRANSFER_REP, 0); + + // Start fast/small 1D transfer from L1 to L1 on channel 1. + snrt_dma_start_1d_channel(buffer_dst_l1_2, buffer_src_l1, TRANSFER_SIZE, 1); + + // Check that the fast transfer can finish first. + uint32_t busy_slow, busy_fast; + do { + asm volatile("dmstati %0, 2" : "=r"(busy_slow)); // ch 0: busy + asm volatile("dmstati %0, 6" : "=r"(busy_fast)); // ch 1: busy + } while (busy_fast); + + // Check that the fast transfer has finished first. + if (!busy_slow) { + errors++; + } + + // Wait for the slow transfer to finish. + snrt_dma_wait_all_channel(0); + + // With dst_stride 0 only the last source row must remain in dst_l1_1. + for (uint32_t i = 0; i < TRANSFER_LEN; i++) { + errors += + (buffer_dst_l1_1[i] != TRANSFER_LEN * (TRANSFER_REP - 1) + i + 1); + errors += (buffer_dst_l1_2[i] != 0xAAAAAAAA); + } + + return errors; +} diff --git a/sw/tests/interrupt.c b/sw/tests/interrupt.c deleted file mode 100644 index 3d1ef907a..000000000 --- a/sw/tests/interrupt.c +++ /dev/null @@ -1,82 +0,0 @@ -// Copyright 2020 ETH Zurich and University of Bologna. 
-// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -#include "snrt.h" - -// Test output printf -#define tprintf(...) printf(__VA_ARGS__) - -// Progress printf -#define pprintf(...) printf(__VA_ARGS__) - -volatile static int32_t global_flag; - -void sleep_loop(uint32_t cluster_core_idx, volatile int32_t *flag) { - snrt_interrupt_enable(IRQ_M_CLUSTER); - for (unsigned i = 8; i; --i) { - snrt_wfi(); - snrt_int_cluster_clr(1 << cluster_core_idx); - __atomic_add_fetch(flag, 1, __ATOMIC_RELAXED); - while (__atomic_load_n(flag, __ATOMIC_RELAXED)) - ; - snrt_cluster_hw_barrier(); - } - snrt_interrupt_disable(IRQ_M_CLUSTER); -} - -int main() { - unsigned cluster_idx = snrt_cluster_idx(); - unsigned core_idx = snrt_global_core_idx(); - unsigned core_num = snrt_global_core_num(); - - // Test1: Core 0 sends interrupts to each hart sequentially and - // polls the flag and checks for correctness - if (core_idx == 0) { - for (unsigned i = 1; i < core_num; i++) { - tprintf("IRQ %d ..", i); - global_flag = -1; - snrt_int_sw_set(snrt_global_core_base_hartid() + i); - while (global_flag != (int)i) - ; - tprintf("OK\n", global_flag); - } - } else { - snrt_interrupt_enable(IRQ_M_SOFT); - snrt_interrupt_global_enable(); - asm volatile("wfi"); - snrt_interrupt_global_disable(); - } - snrt_global_barrier(); - if (core_idx == 0) pprintf("Test 1 complete\n"); - - // Test2: Enable software interrupt wihout jumping to the exception - // address - if (core_idx == 0) { - for (unsigned i = 1; i < core_num; i++) { - tprintf("trig %d..", i); - global_flag = -1; - snrt_int_sw_set(snrt_global_core_base_hartid() + i); - while (global_flag != ((int)i << 8)) - ; - tprintf("OK\n", global_flag); - } - } else { - snrt_interrupt_enable(IRQ_M_SOFT); - asm volatile("wfi"); - // interrupts are disabled so the mcause register is not updated - if (snrt_int_sw_get(snrt_hartid())) { - snrt_int_sw_clear(snrt_global_core_base_hartid() + core_idx); 
- global_flag = core_idx << 8; - } - } - snrt_global_barrier(); - if (core_idx == 0) pprintf("Test 2 complete\n"); - - return 0; -} - -void irq_m_soft(uint32_t core_idx) { - snrt_int_sw_clear(snrt_global_core_base_hartid() + core_idx); - global_flag = core_idx; -} diff --git a/target/snitch_cluster/.gitignore b/target/snitch_cluster/.gitignore index f74d9fde4..ea98bf82c 100644 --- a/target/snitch_cluster/.gitignore +++ b/target/snitch_cluster/.gitignore @@ -7,4 +7,5 @@ /work-vlt/ /work-vcs/ /*.log -/runs/ \ No newline at end of file +/runs/ +.rtlbinary diff --git a/target/snitch_cluster/cfg/dma_mchan.hjson b/target/snitch_cluster/cfg/dma_mchan.hjson new file mode 100644 index 000000000..4c43235b3 --- /dev/null +++ b/target/snitch_cluster/cfg/dma_mchan.hjson @@ -0,0 +1,154 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +// Cluster configuration for a simple testbench system. 
+{ + nr_s1_quadrant: 1, + s1_quadrant: { + nr_clusters: 1, + }, + + cluster: { + boot_addr: 4096, // 0x1000 + cluster_base_addr: 268435456, // 0x1000_0000 + cluster_base_offset: 0, // 0x0 + cluster_base_hartid: 0, + addr_width: 48, + data_width: 64, + user_width: 5, // clog2(total number of clusters) + tcdm: { + size: 128, + banks: 32, + }, + cluster_periph_size: 64, // kB + zero_mem_size: 64, // kB + alias_region_enable: true, + dma_data_width: 512, + dma_nr_channels: 4, + dma_axi_req_fifo_depth: 24, + dma_req_fifo_depth: 8, + narrow_trans: 4, + wide_trans: 32, + dma_user_width: 1, + // We don't need Snitch debugging in Occamy + enable_debug: false, + // We don't need Snitch (core-internal) virtual memory support + vm_support: false, + // Memory configuration inputs + sram_cfg_expose: true, + sram_cfg_fields: { + ema: 3, + emaw: 2, + emas: 1 + }, + // Timing parameters + timing: { + lat_comp_fp32: 2, + lat_comp_fp64: 3, + lat_comp_fp16: 1, + lat_comp_fp16_alt: 1, + lat_comp_fp8: 1, + lat_comp_fp8_alt: 1, + lat_noncomp: 1, + lat_conv: 2, + lat_sdotp: 3, + fpu_pipe_config: "BEFORE", + narrow_xbar_latency: "CUT_ALL_PORTS", + wide_xbar_latency: "CUT_ALL_PORTS", + // Isolate the core. 
+ register_core_req: true, + register_core_rsp: true, + register_offload_req: true, + register_offload_rsp: true, + register_fpu_req: true, + register_ext_narrow: false, + register_ext_wide: false + }, + hives: [ + // Hive 0 + { + icache: { + size: 8, // total instruction cache size in kByte + sets: 2, // number of ways + cacheline: 256 // word size in bits + }, + cores: [ + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/compute_core_template" }, + { $ref: "#/dma_core_template" }, + ] + } + ] + }, + dram: { + // 0x8000_0000 + address: 2147483648, + // 0x8000_0000 + length: 2147483648 + }, + peripherals: { + clint: { + // 0xffff_0000 + address: 4294901760, + // 0x0000_1000 + length: 4096 + }, + }, + // Templates. + compute_core_template: { + isa: "rv32imafd", + xssr: true, + xfrep: true, + xdma: false, + xf16: true, + xf16alt: true, + xf8: true, + xf8alt: true, + xfdotp: true, + xfvec: true, + ssr_nr_credits: 4, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + // SSSR configuration below + ssr_intersection: true, + ssr_intersection_triple: [0, 1, 2], + ssrs: [ + {indirection: true}, // Master 0 + {indirection: true}, // Master 1 + {}, // Slave + ], + }, + dma_core_template: { + isa: "rv32imafd", + // Xdiv_sqrt: true, + // isa: "rv32ema", + xdma: true, + xssr: false, + xfrep: false, + xf16: false, + xf16alt: false, + xf8: false, + xf8alt: false, + xfdotp: false, + xfvec: false, + num_int_outstanding_loads: 1, + num_int_outstanding_mem: 4, + num_fp_outstanding_loads: 4, + num_fp_outstanding_mem: 4, + num_sequencer_instructions: 16, + num_dtlb_entries: 1, + num_itlb_entries: 1, + } +} 
diff --git a/target/snitch_cluster/sw/dma_mchan.yaml b/target/snitch_cluster/sw/dma_mchan.yaml new file mode 100644 index 000000000..0f9f4b16b --- /dev/null +++ b/target/snitch_cluster/sw/dma_mchan.yaml @@ -0,0 +1,6 @@ +# Copyright 2023 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +runs: + - elf: tests/build/dma_mchan.elf diff --git a/target/snitch_cluster/sw/run.yaml b/target/snitch_cluster/sw/run.yaml index 45af517b1..181d96ab7 100644 --- a/target/snitch_cluster/sw/run.yaml +++ b/target/snitch_cluster/sw/run.yaml @@ -55,7 +55,6 @@ runs: simulators: [vsim, vcs, verilator] # banshee fails with illegal instruction # - elf: tests/build/fp64_conversions_scalar.elf # simulators: [vsim, vcs, verilator] - # - elf: tests/build/interrupt.elf - elf: tests/build/interrupt_local.elf - elf: tests/build/multi_cluster.elf - elf: tests/build/openmp_parallel.elf diff --git a/target/snitch_cluster/sw/tests/Makefile b/target/snitch_cluster/sw/tests/Makefile index b2d16fbac..87a488e9a 100644 --- a/target/snitch_cluster/sw/tests/Makefile +++ b/target/snitch_cluster/sw/tests/Makefile @@ -44,7 +44,7 @@ RISCV_LDFLAGS += -lsnRuntime # Outputs # ########### -APPS = $(shell $(MK_DIR)/../../../../util/sim/list_apps.py --in-dir tests/ ../run.yaml) +APPS = $(basename $(notdir $(wildcard $(SRC_DIR)/*.c))) ELFS = $(abspath $(addprefix $(BUILDDIR)/,$(addsuffix .elf,$(APPS)))) DEPS = $(abspath $(addprefix $(BUILDDIR)/,$(addsuffix .d,$(APPS)))) DUMPS = $(abspath $(addprefix $(BUILDDIR)/,$(addsuffix .dump,$(APPS))))