diff --git a/sw/snRuntime/src/alloc_v2.h b/sw/snRuntime/src/alloc_v2.h index 29ffb81e52..db6a2916bf 100644 --- a/sw/snRuntime/src/alloc_v2.h +++ b/sw/snRuntime/src/alloc_v2.h @@ -2,29 +2,61 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +/** + * @file + * @brief Defines functions to dynamically allocate the cluster's L1 memory. + * + * This file provides functions to dynamically allocate the cluster's L1 + * memory. It includes functions for allocating memory for cluster-local + * variables, compute core-local variables, and for manipulating pointers to + * variables allocated by different cores or clusters. + */ + extern __thread snrt_allocator_t l1_allocator_v2; +/** + * @brief Get a pointer to the L1 allocator. + * + * @return Pointer to the L1 allocator. + */ inline snrt_allocator_t *snrt_l1_allocator_v2() { return &l1_allocator_v2; } +/** + * @brief Get the next pointer of the L1 allocator. + * + * @return The next pointer of the L1 allocator. + */ inline void *snrt_l1_next_v2() { return (void *)snrt_l1_allocator_v2()->next; } /** - * @brief Override the L1 allocator next pointer + * @brief Override the L1 allocator next pointer. + * + * @param next The new value for the next pointer. */ inline void snrt_l1_update_next_v2(void *next) { snrt_l1_allocator_v2()->next = (uint32_t)next; } -// Check that allocation doesn't exceed allocator bounds, and raise an -// exception otherwise +/** + * @brief Check if the allocation exceeds the allocator bounds and raise an + * exception if it does. + */ inline void snrt_l1_alloc_check_bounds() { if (snrt_l1_allocator_v2()->next > snrt_l1_allocator_v2()->end) asm volatile("ecall \n"); } -// Dynamically allocate space for a variable of size `size` in the cluster's L1 -// memory. This function should be invoked by every core in a cluster. Every -// core receives a pointer to the allocated variable. +/** + * @brief Allocate space for a variable in the cluster's L1 memory. + * + * This function dynamically allocates space for a variable of size `size` in + * the cluster's L1 memory. + * The allocation is aligned to the specified `alignment`. + * + * @param size The size of the variable to allocate. + * @param alignment The alignment of the allocation. + * @return Pointer to the allocated variable. + */ inline void *snrt_l1_alloc_cluster_local(size_t size, const size_t alignment) { snrt_l1_allocator_v2()->next = ALIGN_UP(snrt_l1_allocator_v2()->next, alignment); @@ -34,11 +66,19 @@ inline void *snrt_l1_alloc_cluster_local(size_t size, const size_t alignment) { return retval; } -// Dynamically allocate space for N variables of size `size` in the cluster's -// L1 memory, N being the number of compute cores in the cluster. This function -// should be invoked by every core in a cluster. Every compute core receives a -// pointer to a unique variable among the N which have been allocated. The -// return value for the DM core is undefined. +/** + * @brief Allocate space for N variables in the cluster's L1 memory. + * + * This function dynamically allocates space for N variables of size `size` in + * the cluster's L1 memory, where N is the number of compute cores in the + * cluster. The variables are allocated in a contiguous block of memory. + * The whole block is aligned to the specified `alignment`. + * + * @param size The size of each variable to allocate. + * @param alignment The alignment of the allocation. + * @return Pointer to the allocated variable for each compute core. 
+ * The return value for the DM core is undefined. + */ inline void *snrt_l1_alloc_compute_core_local(size_t size, const size_t alignment) { snrt_l1_allocator_v2()->next = @@ -49,24 +89,52 @@ inline void *snrt_l1_alloc_compute_core_local(size_t size, return retval; } -// Takes a pointer to a variable allocated using -// `snrt_l1_alloc_compute_core_local` and returns a pointer to the same -// variable allocated by another core, as specified by `core_idx`. -// The `size` argument should be the same used during allocation. +/** + * @brief Get a pointer to the same variable allocated by another core. + * + * This function takes a pointer to a variable allocated using + * `snrt_l1_alloc_compute_core_local` and returns a pointer to the same + * variable allocated by another core, as specified by `core_idx`. + * The `size` argument should be the same used during allocation. + * + * @param ptr Pointer to the variable allocated by the current core. + * @param core_idx Index of the core that allocated the variable. + * @param size The size of the variable. + * @return Pointer to the same variable allocated by the specified core. + */ inline void *snrt_compute_core_local_ptr(void *ptr, uint32_t core_idx, size_t size) { size_t offset = (core_idx - snrt_cluster_core_idx()) * size; return (void *)((uintptr_t)ptr + offset); } -// Takes a pointer to a variable in the source cluster's L1 memory and returns -// a pointer to the same offset in the destination cluster's L1 memory. +/** + * @brief Get a pointer to the same offset in another cluster's L1 memory. + * + * This function takes a pointer to a variable in the calling (source) + * cluster's L1 memory and returns a pointer to the same offset in the target + * (destination) cluster's L1 memory. + * + * @param ptr Pointer to the variable in the source cluster's L1 memory. + * @param src_cluster_idx Index of the source cluster. + * @param dst_cluster_idx Index of the destination cluster. + * @return Pointer to the same offset in the destination cluster's L1 memory. + */ inline void *snrt_remote_l1_ptr(void *ptr, uint32_t src_cluster_idx, uint32_t dst_cluster_idx) { return (void *)((uintptr_t)ptr + (dst_cluster_idx - src_cluster_idx) * SNRT_CLUSTER_OFFSET); } +/** + * @brief Initialize the L1 allocator. + * + * This function initializes the L1 allocator by calculating the end address + * of the heap and setting the base, end, and next pointers of the allocator. + * + * @note This function should be called before using any of the allocation + * functions. + */ inline void snrt_alloc_init_v2() { // Calculate end address of the heap. The top of the TCDM address space is // reserved for the cluster-local storage (CLS) and the stack of every diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h index a4b82a7d32..e8147be7d0 100644 --- a/sw/snRuntime/src/dma.h +++ b/sw/snRuntime/src/dma.h @@ -2,6 +2,11 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +/** + * @file + * @brief This file provides functions to program the Snitch DMA. + */ + #pragma once #define OP_CUSTOM1 0b0101011 @@ -19,9 +24,17 @@ ((funct7 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) | (rd << 7) | \ (opcode)) + /// A DMA transfer identifier. typedef uint32_t snrt_dma_txid_t; +/** + * @brief Start an asynchronous 1D DMA transfer with 64-bit wide pointers. + * @param dst The destination address. + * @param src The source address. + * @param size The size of the transfer in bytes. + * @return The DMA transfer ID. 
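+ *
+ * A minimal usage sketch (the addresses and size below are illustrative
+ * placeholders, not part of the API):
+ * @code{.c}
+ * // Copy 256 bytes between two 64-bit addresses, then wait for completion.
+ * snrt_dma_txid_t txid = snrt_dma_start_1d_wideptr(dst_addr, src_addr, 256);
+ * snrt_dma_wait(txid);
+ * @endcode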
+ */ inline uint32_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src, size_t size) { register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10 @@ -51,13 +64,30 @@ inline uint32_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src, return reg_txid; } -/// Initiate an asynchronous 1D DMA transfer. +/** + * @brief Start an asynchronous 1D DMA transfer with native-size pointers. + * @param dst The destination pointer. + * @param src The source pointer. + * @param size The size of the transfer in bytes. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_1d(void *dst, const void *src, size_t size) { return snrt_dma_start_1d_wideptr((size_t)dst, (size_t)src, size); } -/// Initiate an asynchronous 2D DMA transfer with wide 64-bit pointers. +/** + * @brief Start an asynchronous 2D DMA transfer with 64-bit wide pointers. + * @param dst The destination address. + * @param src The source address. + * @param size The size of every 1D transfer within the 2D transfer in bytes. + * @param dst_stride The offset between consecutive 1D transfers at the + * destination, in bytes. + * @param src_stride The offset between consecutive 1D transfers at the + * source, in bytes. + * @param repeat The number of 1D transfers composing the 2D transfer. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_2d_wideptr(uint64_t dst, uint64_t src, size_t size, size_t dst_stride, size_t src_stride, @@ -102,7 +132,18 @@ inline snrt_dma_txid_t snrt_dma_start_2d_wideptr(uint64_t dst, uint64_t src, return reg_txid; } -/// Initiate an asynchronous 2D DMA transfer. +/** + * @brief Start an asynchronous 2D DMA transfer with native-size pointers. + * @param dst The destination address. + * @param src The source address. + * @param size The size of every 1D transfer within the 2D transfer in bytes. + * @param dst_stride The offset between consecutive 1D transfers at the + * destination, in bytes. + * @param src_stride The offset between consecutive 1D transfers at the + * source, in bytes. + * @param repeat The number of 1D transfers composing the 2D transfer. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_2d(void *dst, const void *src, size_t size, size_t dst_stride, size_t src_stride, size_t repeat) { @@ -110,8 +151,15 @@ inline snrt_dma_txid_t snrt_dma_start_2d(void *dst, const void *src, src_stride, repeat); } -/// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers and a -/// specific channel. +/** + * @brief Start an asynchronous 1D DMA transfer with 64-bit wide pointers on a + * specific channel. + * @param dst The destination address. + * @param src The source address. + * @param size The size of the transfer in bytes. + * @param channel The index of the channel. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_1d_channel_wideptr(uint64_t dst, uint64_t src, size_t size, @@ -144,7 +192,15 @@ inline snrt_dma_txid_t snrt_dma_start_1d_channel_wideptr(uint64_t dst, return reg_txid; } -/// Initiate an asynchronous 1D DMA transfer and a specific channel. +/** + * @brief Start an asynchronous 1D DMA transfer with native-size pointers on a + * specific channel. + * @param dst The destination pointer. + * @param src The source pointer. + * @param size The size of the transfer in bytes. + * @param channel The index of the channel. + * @return The DMA transfer ID. 
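+ *
+ * Illustrative sketch (the buffer pointers, size and channel index are
+ * example values):
+ * @code{.c}
+ * // Issue the copy on DMA channel 0 and wait for that specific transfer.
+ * snrt_dma_txid_t txid = snrt_dma_start_1d_channel(dst, src, size, 0);
+ * snrt_dma_wait_channel(txid, 0);
+ * @endcode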
+ */ inline snrt_dma_txid_t snrt_dma_start_1d_channel(void *dst, const void *src, size_t size, uint32_t channel) { @@ -152,8 +208,20 @@ inline snrt_dma_txid_t snrt_dma_start_1d_channel(void *dst, const void *src, channel); } -/// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers and a -/// specific channel. +/** + * @brief Start an asynchronous 2D DMA transfer with 64-bit wide pointers on a + * specific channel. + * @param dst The destination address. + * @param src The source address. + * @param size The size of every 1D transfer within the 2D transfer in bytes. + * @param dst_stride The offset between consecutive 1D transfers at the + * destination, in bytes. + * @param src_stride The offset between consecutive 1D transfers at the + * source, in bytes. + * @param repeat The number of 1D transfers composing the 2D transfer. + * @param channel The index of the channel. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_2d_channel_wideptr( uint64_t dst, uint64_t src, size_t size, size_t dst_stride, size_t src_stride, size_t repeat, uint32_t channel) { @@ -198,7 +266,20 @@ inline snrt_dma_txid_t snrt_dma_start_2d_channel_wideptr( return reg_txid; } -/// Initiate an asynchronous 2D DMA transfer and a specific channel. +/** + * @brief Start an asynchronous 2D DMA transfer with native-size pointers on a + * specific channel. + * @param dst The destination address. + * @param src The source address. + * @param size The size of every 1D transfer within the 2D transfer in bytes. + * @param dst_stride The offset between consecutive 1D transfers at the + * destination, in bytes. + * @param src_stride The offset between consecutive 1D transfers at the + * source, in bytes. + * @param repeat The number of 1D transfers composing the 2D transfer. + * @param channel The index of the channel. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_2d_channel(void *dst, const void *src, size_t size, size_t dst_stride, size_t src_stride, @@ -209,7 +290,10 @@ inline snrt_dma_txid_t snrt_dma_start_2d_channel(void *dst, const void *src, channel); } -/// Block until a transfer finishes. +/** + * @brief Block until a DMA transfer finishes. + * @param tid The DMA transfer ID. + */ inline void snrt_dma_wait(snrt_dma_txid_t tid) { // dmstati t0, 0 # 0=status.completed_id asm volatile( @@ -221,7 +305,10 @@ inline void snrt_dma_wait(snrt_dma_txid_t tid) { : : [ tid ] "r"(tid) : "t0"); } -/// Block until a transfer finishes on a specific channel. +/** + * @brief Block until a DMA transfer finishes on a specific channel. + * @param tid The DMA transfer ID. + */ inline void snrt_dma_wait_channel(snrt_dma_txid_t tid, uint32_t channel) { // dmstati t0, 0 # 0=status.completed_id register uint32_t cfg asm("t1") = channel << 2; @@ -235,7 +322,9 @@ inline void snrt_dma_wait_channel(snrt_dma_txid_t tid, uint32_t channel) { : "t0"); } -/// Block until all operation on the DMA ceases. +/** + * @brief Block until all DMA operations cease. + */ inline void snrt_dma_wait_all() { // dmstati t0, 2 # 2=status.busy asm volatile( @@ -246,7 +335,10 @@ inline void snrt_dma_wait_all() { : "t0"); } -/// Block until all operation on the DMA ceases on a specific channel. +/** + * @brief Block until a specific DMA channel is idle. + * @param channel The index of the channel.
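+ *
+ * Example sketch (the channel index and transfer arguments are placeholders):
+ * @code{.c}
+ * // Issue one or more transfers on channel 1, then drain that channel only.
+ * snrt_dma_start_1d_channel(dst, src, size, 1);
+ * snrt_dma_wait_all_channel(1);
+ * @endcode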
+ */ inline void snrt_dma_wait_all_channel(uint32_t channel) { register uint32_t tmp; // dmstati t0, 2 # 2=status.busy @@ -260,7 +352,10 @@ inline void snrt_dma_wait_all_channel(uint32_t channel) { : "t0"); } -/// Wait until all channels are idle +/** + * @brief Block until the first @p num_channels channels are idle. + * @param num_channels The number of channels to wait on. + */ inline void snrt_dma_wait_all_channels(uint32_t num_channels) { register uint32_t tmp; // dmstati t0, 2 # 2=status.busy @@ -270,10 +365,10 @@ inline void snrt_dma_wait_all_channels(uint32_t num_channels) { } /** - * @brief start tracking of dma performance region. Does not have any + * @brief Start tracking of dma performance region. Does not have any * implications on the HW. Only injects a marker in the DMA traces that can be - * analyzed - * + * analyzed. + * @deprecated */ inline void snrt_dma_start_tracking() { // dmstati zero, 0 @@ -282,10 +377,10 @@ inline void snrt_dma_start_tracking() { } /** - * @brief stop tracking of dma performance region. Does not have any + * @brief Stop tracking of dma performance region. Does not have any * implications on the HW. Only injects a marker in the DMA traces that can be - * analyzed - * + * analyzed. + * @deprecated */ inline void snrt_dma_stop_tracking() { asm volatile(".word %0\n" ::"i"( @@ -293,11 +388,10 @@ inline void snrt_dma_stop_tracking() { } /** - * @brief fast memset function performed by DMA - * - * @param ptr pointer to the start of the region - * @param value value to set - * @param len number of bytes, must be multiple of DMA bus-width + * @brief Fast memset function performed by DMA. + * @param ptr Pointer to the start of the region. + * @param value Value to set. + * @param len Number of bytes, must be a multiple of the DMA bus width. */ inline void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len) { // set first 64bytes to value @@ -314,9 +408,14 @@ inline void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len) { snrt_dma_wait_all(); } -/// Load a 1D-tile of size tile_size from a 1D array. The specific tile is -/// selected by tile_idx. Every element in the src and dst arrays has prec -/// bytes. +/** + * @brief Load a tile of a 1D array. + * @param dst Pointer to the tile destination. + * @param src Pointer to the source array. + * @param tile_idx Index of the tile in the 1D array. + * @param tile_size Number of elements within a tile of the 1D array. + * @param prec Number of bytes of each element in the 1D array. + */ inline snrt_dma_txid_t snrt_dma_load_1d_tile(void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec) { @@ -324,9 +423,14 @@ inline snrt_dma_txid_t snrt_dma_load_1d_tile(void *dst, void *src, return snrt_dma_start_1d(dst, src + tile_idx * tile_nbytes, tile_nbytes); } -/// Store a 1D-tile of size tile_size to a 1D array. The specific tile is -/// selected by tile_idx. Every element in the src and dst arrays has prec -/// bytes. +/** + * @brief Store a tile to a 1D array. + * @param dst Pointer to the destination array. + * @param src Pointer to the source tile. + * @param tile_idx Index of the tile in the 1D array. + * @param tile_size Number of elements within a tile of the 1D array. + * @param prec Number of bytes of each element in the 1D array. 
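+ *
+ * Illustrative sketch (the array and tile pointers, tile index and tile size
+ * are assumptions made for the example):
+ * @code{.c}
+ * // Write tile `i` of `tile_size` doubles from L1 back to the full array.
+ * snrt_dma_store_1d_tile(dram_array, l1_tile, i, tile_size, sizeof(double));
+ * snrt_dma_wait_all();
+ * @endcode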
+ */ inline snrt_dma_txid_t snrt_dma_store_1d_tile(void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec) { @@ -334,10 +438,20 @@ inline snrt_dma_txid_t snrt_dma_store_1d_tile(void *dst, void *src, return snrt_dma_start_1d(dst + tile_idx * tile_nbytes, src, tile_nbytes); } -/// Load a 2D-tile of shape (tile_x1_size, tile_x0_size) from the 2D array -/// of shape (full_x1_size, full_x0_size). The specific tile is selected -/// by the (tile_x1_idx, tile_x0_idx) tuple. Every element in the src and -/// destination arrays has prec bytes. +/** + * @brief Load a 2D tile of a 2D array. + * @param dst Pointer to the tile destination. + * @param src Pointer to the source array. + * @param tile_x1_idx Outermost coordinate of the tile in the 2D array. + * @param tile_x0_idx Innermost coordinate of the tile in the 2D array. + * @param tile_x1_size Number of elements in the outermost dimension of the + * tile. + * @param tile_x0_size Number of elements in the innermost dimension of the + * tile. + * @param full_x0_size Number of elements in the innermost dimension of the + * array. + * @param prec Number of bytes of each element in the 2D array. + */ inline snrt_dma_txid_t snrt_dma_load_2d_tile( void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, @@ -357,10 +471,20 @@ inline snrt_dma_txid_t snrt_dma_load_2d_tile( ); } -/// Store a 2D-tile of shape (tile_x1_size, tile_x0_size) to the 2D array -/// of shape (full_x1_size, full_x0_size). The specific tile is selected -/// by the (tile_x1_idx, tile_x0_idx) tuple. Every element in the src and -/// destination arrays has prec bytes. +/** + * @brief Store a 2D tile to a 2D array. + * @param dst Pointer to the destination array. + * @param src Pointer to the source tile. + * @param tile_x1_idx Outermost coordinate of the tile in the 2D array. + * @param tile_x0_idx Innermost coordinate of the tile in the 2D array. + * @param tile_x1_size Number of elements in the outermost dimension of the + * tile. + * @param tile_x0_size Number of elements in the innermost dimension of the + * tile. + * @param full_x0_size Number of elements in the innermost dimension of the + * array. + * @param prec Number of bytes of each element in the 2D array. + */ inline snrt_dma_txid_t snrt_dma_store_2d_tile( void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, diff --git a/sw/snRuntime/src/ssr.h b/sw/snRuntime/src/ssr.h index 1a067fea5c..f932fd2ab5 100644 --- a/sw/snRuntime/src/ssr.h +++ b/sw/snRuntime/src/ssr.h @@ -2,9 +2,27 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +/** + * @file + * @brief This file contains functions to conveniently program Snitch's SSRs. + * + * An SSR stream can be configured to replace a store (or load) sequence as + * could be generated by an N-dimensional affine loop nest: + * @code{.c} + * for (int i = 0; i < b1; i++) + * for (int j = 0; j < b0; j++) + * array[i * s1 + j * s0] = 0; + * @endcode + * + * The configuration functions provided in this file reflect the parameters + * one would define to set up such a loop nest. + */ + #pragma once -/// Synchronize the integer and float pipelines. +/** + * @brief Synchronize the integer and float pipelines. + */ inline void snrt_fpu_fence() { unsigned tmp; asm volatile( @@ -13,34 +31,41 @@ inline void snrt_fpu_fence() { : "+r"(tmp)::"memory"); } -/// The different SSR data movers. 
+/** + * @brief The different SSRs. + */ enum snrt_ssr_dm { - SNRT_SSR_DM0 = 0, - SNRT_SSR_DM1 = 1, - SNRT_SSR_DM2 = 2, - // To write to all SSRs, use index 31 - SNRT_SSR_DM_ALL = 31, + SNRT_SSR_DM0 = 0, /**< SSR data mover 0 */ + SNRT_SSR_DM1 = 1, /**< SSR data mover 1 */ + SNRT_SSR_DM2 = 2, /**< SSR data mover 2 */ + SNRT_SSR_DM_ALL = 31 /**< Write to all SSRs */ }; -/// The different dimensions. +/** + * @brief The different dimensions. + */ enum snrt_ssr_dim { - SNRT_SSR_1D = 0, - SNRT_SSR_2D = 1, - SNRT_SSR_3D = 2, - SNRT_SSR_4D = 3, + SNRT_SSR_1D = 0, /**< 1D stream */ + SNRT_SSR_2D = 1, /**< 2D stream */ + SNRT_SSR_3D = 2, /**< 3D stream */ + SNRT_SSR_4D = 3 /**< 4D stream */ }; -/// The SSR configuration registers. +/** + * @brief The SSR configuration registers. + */ enum { - REG_STATUS = 0, - REG_REPEAT = 1, - REG_BOUNDS = 2, // + loop index - REG_STRIDES = 6, // + loop index - REG_RPTR = 24, // + snrt_ssr_dim - REG_WPTR = 28, // + snrt_ssr_dim + REG_STATUS = 0, /**< SSR status register */ + REG_REPEAT = 1, /**< SSR repeat register */ + REG_BOUNDS = 2, /**< SSR bounds register */ + REG_STRIDES = 6, /**< SSR strides register */ + REG_RPTR = 24, /**< SSR read pointer register */ + REG_WPTR = 28 /**< SSR write pointer register */ }; -/// Enable SSR. +/** + * @brief Enable all SSRs. + */ inline void snrt_ssr_enable() { #ifdef __TOOLCHAIN_LLVM__ __builtin_ssr_enable(); @@ -49,7 +74,9 @@ inline void snrt_ssr_enable() { #endif } -/// Disable SSR. +/** + * @brief Disable all SSRs. + */ inline void snrt_ssr_disable() { #ifdef __TOOLCHAIN_LLVM__ __builtin_ssr_disable(); @@ -58,20 +85,36 @@ inline void snrt_ssr_disable() { #endif } +/** + * @brief Read the value of an SSR configuration register. + * @param reg The register index. + * @param dm The SSR index. + * @return The value of the register. + */ inline uint32_t read_ssr_cfg(uint32_t reg, uint32_t dm) { + uint32_t value; asm volatile("scfgri %[value], %[dm] | %[reg]<<5\n" : [ value ] "=r"(value) : [ dm ] "i"(dm), [ reg ] "i"(reg)); return value; } - +/** + * @brief Write a value to an SSR configuration register. + * @param reg The register index. + * @param dm The SSR index. + * @param value The value to write. + */ inline void write_ssr_cfg(uint32_t reg, uint32_t dm, uint32_t value) { asm volatile("scfgwi %[value], %[dm] | %[reg]<<5\n" ::[value] "r"(value), [ dm ] "i"(dm), [ reg ] "i"(reg)); } - -// Configure an SSR data mover for a 1D loop nest. +/** + * @brief Configure an SSR data mover for a 1D loop nest. + * @param dm The SSR index. + * @param b0 The bound of the loop. + * @param s0 The stride of the loop. + */ inline void snrt_ssr_loop_1d(enum snrt_ssr_dm dm, size_t b0, size_t s0) { --b0; write_ssr_cfg(REG_BOUNDS + 0, dm, b0); @@ -79,8 +122,14 @@ inline void snrt_ssr_loop_1d(enum snrt_ssr_dm dm, size_t b0, size_t s0) { write_ssr_cfg(REG_STRIDES + 0, dm, s0 - a); a += s0 * b0; } - -// Configure an SSR data mover for a 2D loop nest. +/** + * @brief Configure an SSR data mover for a 2D loop nest. + * @param dm The SSR index. + * @param b0 The bound of the first loop. + * @param b1 The bound of the second loop. + * @param s0 The stride of the first loop. + * @param s1 The stride of the second loop. + */ inline void snrt_ssr_loop_2d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t s0, size_t s1) { --b0; @@ -94,7 +143,16 @@ inline void snrt_ssr_loop_2d(enum snrt_ssr_dm dm, size_t b0, size_t b1, a += s1 * b1; } -// Configure an SSR data mover for a 3D loop nest. +/** + * @brief Configure an SSR data mover for a 3D loop nest. 
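+ *
+ * Analogously to the file-level example, the configured stream replaces the
+ * accesses of a loop nest of the form below, with loop 0 (b0, s0) innermost
+ * (illustrative sketch):
+ * @code{.c}
+ * for (int i = 0; i < b2; i++)
+ *     for (int j = 0; j < b1; j++)
+ *         for (int k = 0; k < b0; k++)
+ *             array[i * s2 + j * s1 + k * s0] = 0;
+ * @endcode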
+ * @param dm The SSR index. + * @param b0 The bound of the first loop. + * @param b1 The bound of the second loop. + * @param b2 The bound of the third loop. + * @param s0 The stride of the first loop. + * @param s1 The stride of the second loop. + * @param s2 The stride of the third loop. + */ inline void snrt_ssr_loop_3d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t b2, size_t s0, size_t s1, size_t s2) { --b0; @@ -112,10 +170,18 @@ inline void snrt_ssr_loop_3d(enum snrt_ssr_dm dm, size_t b0, size_t b1, a += s2 * b2; } -// Configure an SSR data mover for a 4D loop nest. -// b0: Inner-most bound (limit of loop) -// b3: Outer-most bound (limit of loop) -// s0: increment size of inner-most loop +/** + * @brief Configure an SSR data mover for a 4D loop nest. + * @param dm The SSR index. + * @param b0 The bound of the first loop. + * @param b1 The bound of the second loop. + * @param b2 The bound of the third loop. + * @param b3 The bound of the fourth loop. + * @param s0 The stride of the first loop. + * @param s1 The stride of the second loop. + * @param s2 The stride of the third loop. + * @param s3 The stride of the fourth loop. + */ inline void snrt_ssr_loop_4d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t b2, size_t b3, size_t s0, size_t s1, size_t s2, size_t s3) { @@ -137,19 +203,30 @@ inline void snrt_ssr_loop_4d(enum snrt_ssr_dm dm, size_t b0, size_t b1, write_ssr_cfg(REG_STRIDES + 3, dm, s3 - a); a += s3 * b3; } - -/// Configure the repetition count for a stream. +/** + * @brief Configure the repetition count for a stream. + * @param dm The SSR index. + * @param count The repetition count. + */ inline void snrt_ssr_repeat(enum snrt_ssr_dm dm, size_t count) { write_ssr_cfg(REG_REPEAT, dm, count - 1); } - -/// Start a streaming read. +/** + * @brief Start a streaming read. + * @param dm The SSR index. + * @param dim The number of dimensions to use. + * @param ptr The pointer to the data. + */ inline void snrt_ssr_read(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim, volatile void *ptr) { write_ssr_cfg(REG_RPTR + dim, dm, (uintptr_t)ptr); } - -/// Start a streaming write. +/** + * @brief Start a streaming write. + * @param dm The SSR index. + * @param dim The number of dimensions to use. + * @param ptr The pointer to the data. + */ inline void snrt_ssr_write(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim, volatile void *ptr) { write_ssr_cfg(REG_WPTR + dim, dm, (uintptr_t)ptr); diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h index fa4b75b244..add26fa082 100644 --- a/sw/snRuntime/src/sync.h +++ b/sw/snRuntime/src/sync.h @@ -5,6 +5,11 @@ // Luca Colagrande // Viviane Potocnik +/** + * @file + * @brief This file provides functions to synchronize Snitch cores. + */ + #pragma once #include @@ -13,11 +18,18 @@ // Mutex functions //================================================================================ +/** + * @brief Get a pointer to a mutex variable. + */ inline volatile uint32_t *snrt_mutex() { return &_snrt_mutex; } /** - * @brief lock a mutex, blocking - * @details declare mutex with `static volatile uint32_t mtx = 0;` + * @brief Acquire a mutex, blocking. + * @details Test-and-set (TAS) implementation of a lock. + * @param pmtx A pointer to a variable which can be used as a mutex, i.e. to + * which all cores have a reference and at a memory location to + * which atomic accesses can be made. This can be declared e.g. as + * `static volatile uint32_t mtx = 0;`. 
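+ *
+ * Typical usage sketch (the mutex variable and critical section are
+ * illustrative):
+ * @code{.c}
+ * static volatile uint32_t mtx = 0;
+ * snrt_mutex_acquire(&mtx);
+ * // ... critical section ...
+ * snrt_mutex_release(&mtx);
+ * @endcode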
*/ inline void snrt_mutex_acquire(volatile uint32_t *pmtx) { asm volatile( @@ -31,9 +43,9 @@ inline void snrt_mutex_acquire(volatile uint32_t *pmtx) { } /** - * @brief lock a mutex, blocking - * @details test and test-and-set (ttas) implementation of a lock. - * Declare mutex with `static volatile uint32_t mtx = 0;` + * @brief Acquire a mutex, blocking. + * @details Same as @ref snrt_mutex_acquire but acquires the lock using a test + * and test-and-set (TTAS) strategy. */ inline void snrt_mutex_ttas_acquire(volatile uint32_t *pmtx) { asm volatile( @@ -50,7 +62,7 @@ inline void snrt_mutex_ttas_acquire(volatile uint32_t *pmtx) { } /** - * @brief Release the mutex + * @brief Release a previously-acquired mutex. */ inline void snrt_mutex_release(volatile uint32_t *pmtx) { asm volatile("amoswap.w.rl x0,x0,(%0) # Release lock by storing 0\n" @@ -61,13 +73,21 @@ inline void snrt_mutex_release(volatile uint32_t *pmtx) { // Barrier functions //================================================================================ -/// Synchronize cores in a cluster with a hardware barrier +/** + * @brief Synchronize cores in a cluster with a hardware barrier, blocking. + * @note Synchronizes all (both DM and compute) cores. All cores must invoke + * this function, or the calling cores will stall indefinitely. + */ inline void snrt_cluster_hw_barrier() { asm volatile("csrr x0, 0x7C2" ::: "memory"); } -// Synchronizes one core from every cluster with the others. -// One core per cluster is expected to invoke this function. +/** + * @brief Synchronize one core from every cluster with the others. + * @details Implemented as a software barrier. + * @note One core per cluster must invoke this function, or the calling cores + * will stall indefinitely. + */ inline void snrt_inter_cluster_barrier() { // Remember previous iteration uint32_t prev_barrier_iteration = _snrt_barrier.iteration; @@ -84,7 +104,15 @@ inline void snrt_inter_cluster_barrier() { } } -/// Synchronize clusters globally with a global software barrier +/** + * @brief Synchronize all Snitch cores. + * @details Synchronization is performed hierarchically. Within a cluster, + * cores are synchronized through a hardware barrier (see + * @ref snrt_cluster_hw_barrier). Clusters are synchronized through + * a software barrier (see @ref snrt_inter_cluster_barrier). + * @note Every Snitch core must invoke this function, or the calling cores + * will stall indefinitely. + */ inline void snrt_global_barrier() { snrt_cluster_hw_barrier(); @@ -96,17 +124,12 @@ inline void snrt_global_barrier() { snrt_cluster_hw_barrier(); } -inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) { - __atomic_add_fetch(&_reduction_result, value, __ATOMIC_RELAXED); - snrt_global_barrier(); - return _reduction_result; -} - /** - * @brief Generic barrier - * - * @param barr pointer to a barrier - * @param n number of harts that have to enter before released + * @brief Generic software barrier. + * @param barr pointer to a barrier variable. + * @param n number of harts that have to enter before released. + * @note Exactly the specified number of harts must invoke this function, or + * the calling cores will stall indefinitely. 
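+ *
+ * Usage sketch: synchronize only the compute cores, assuming `barr` points to
+ * a zero-initialized snrt_barrier_t visible to all participating harts
+ * (allocation not shown here):
+ * @code{.c}
+ * if (snrt_is_compute_core())
+ *     snrt_partial_barrier(barr, snrt_global_compute_core_num());
+ * @endcode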
+ */ inline void snrt_partial_barrier(snrt_barrier_t *barr, uint32_t n) { // Remember previous iteration @@ -128,8 +151,37 @@ // Reduction functions //================================================================================ -// Assumes the dst and src buffers are at the same offset in the TCDM of every -// cluster +/** + * @brief Perform a global sum reduction, blocking. + * @details All cores participate in the reduction and synchronize globally + * to wait for the reduction to complete. + * The synchronization is performed via @ref snrt_global_barrier. + * @param value The value to be summed. + * @return The result of the sum reduction. + * @note Every Snitch core must invoke this function, or the calling cores + * will stall indefinitely. + */ +inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) { __atomic_add_fetch(&_reduction_result, value, __ATOMIC_RELAXED); snrt_global_barrier(); return _reduction_result; } + +/** + * @brief Perform a sum reduction among clusters, blocking. + * @details The reduction is performed in a logarithmic fashion. Half of the + * clusters active in every level of the binary tree participate as + * senders, the other half as receivers. Senders use the DMA to + * send their data to the respective receiver's destination buffer. + * The receiver then reduces each element in its destination buffer + * with the respective element in its source buffer. It then proceeds + * to the next level in the binary tree. + * @param dst_buffer The pointer to the calling cluster's destination buffer. + * @param src_buffer The pointer to the calling cluster's source buffer. + * @param len The amount of data in each buffer. + * @note The destination buffers must lie at the same offset in every cluster's + * TCDM. + */ inline void snrt_global_reduction_dma(double *dst_buffer, double *src_buffer, size_t len) { // If we have a single cluster the reduction degenerates to a memcpy diff --git a/sw/snRuntime/src/team.h b/sw/snRuntime/src/team.h index eb06a4488a..ebdaf34d55 100644 --- a/sw/snRuntime/src/team.h +++ b/sw/snRuntime/src/team.h @@ -2,67 +2,151 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +/** + * @file + * @brief This file contains functions and macros related to Snitch team management. + * + * The functions in this file provide information about the Snitch hardware configuration, + * such as the number of clusters, cores per cluster, and the current core's index within + * the system. These functions can be used for team management and core-specific operations. + */ + #pragma once +/** + * @brief Get the RISC-V hardware thread ID (hartid). + * + * @return The hardware thread ID. + */ inline uint32_t __attribute__((const)) snrt_hartid() { uint32_t hartid; asm("csrr %0, mhartid" : "=r"(hartid)); return hartid; } +/** + * @brief Get the number of Snitch clusters in the system. + * + * @return The number of clusters. + */ inline uint32_t __attribute__((const)) snrt_cluster_num() { return SNRT_CLUSTER_NUM; } +/** + * @brief Get the number of cores per cluster. + * + * @return The number of cores per cluster. + */ inline uint32_t __attribute__((const)) snrt_cluster_core_num() { return SNRT_CLUSTER_CORE_NUM; } +/** + * @brief Get the hartid of the first Snitch core in the system. + * + * @return The hartid of the first Snitch core in the system.
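+ *
+ * For example, a core's system-wide index (see snrt_global_core_idx())
+ * relates to its hartid as follows:
+ * @code{.c}
+ * uint32_t global_core_idx = snrt_hartid() - snrt_global_core_base_hartid();
+ * @endcode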
+ */ inline uint32_t __attribute__((const)) snrt_global_core_base_hartid() { return SNRT_BASE_HARTID; } +/** + * @brief Get the total number of Snitch cores in the system. + * + * @return The total number of cores. + */ inline uint32_t __attribute__((const)) snrt_global_core_num() { return snrt_cluster_num() * snrt_cluster_core_num(); } +/** + * @brief Get the total number of Snitch compute cores in the system. + * + * @return The total number of compute cores. + */ inline uint32_t __attribute__((const)) snrt_global_compute_core_num() { return snrt_cluster_num() * snrt_cluster_compute_core_num(); } +/** + * @brief Get the index (!= hartid) of the current Snitch core in the system. + * + * @return The index of the current Snitch core. + */ inline uint32_t __attribute__((const)) snrt_global_core_idx() { return snrt_hartid() - snrt_global_core_base_hartid(); } +/** + * @brief Get the index of the current Snitch compute core in the system. + * + * @return The index of the current Snitch compute core. + */ inline uint32_t __attribute__((const)) snrt_global_compute_core_idx() { return snrt_cluster_idx() * snrt_cluster_compute_core_num() + snrt_cluster_core_idx(); } +/** + * @brief Get the index of the current Snitch cluster in the system. + * + * @return The index of the current cluster. + */ inline uint32_t __attribute__((const)) snrt_cluster_idx() { return snrt_global_core_idx() / snrt_cluster_core_num(); } +/** + * @brief Get the index of the current Snitch core within the cluster. + * + * @return The index of the current core within the cluster. + */ inline uint32_t __attribute__((const)) snrt_cluster_core_idx() { return snrt_global_core_idx() % snrt_cluster_core_num(); } +/** + * @brief Get the number of data mover (DM) cores per cluster. + * + * @return The number of DM cores per cluster. + */ inline uint32_t __attribute__((const)) snrt_cluster_dm_core_num() { return SNRT_CLUSTER_DM_CORE_NUM; } +/** + * @brief Get the number of compute cores per cluster. + * + * @return The number of compute cores per cluster. + */ inline uint32_t __attribute__((const)) snrt_cluster_compute_core_num() { return snrt_cluster_core_num() - snrt_cluster_dm_core_num(); } +/** + * @brief Check if the current core is a compute core. + * + * @return True if the current core is a compute core, false otherwise. + */ inline int __attribute__((const)) snrt_is_compute_core() { return snrt_cluster_core_idx() < snrt_cluster_compute_core_num(); } +/** + * @brief Check if the current core is the last compute core in the cluster. + * + * @return True if the current core is the last compute core, false otherwise. + */ inline int __attribute__((const)) snrt_cluster_is_last_compute_core() { return snrt_cluster_core_idx() == (snrt_cluster_compute_core_num() - 1); } +/** + * @brief Check if the current core is a data mover (DM) core. + * + * @return True if the current core is a DM core, false otherwise. + */ inline int __attribute__((const)) snrt_is_dm_core() { return !snrt_is_compute_core(); }