diff --git a/sw/snRuntime/src/alloc_v2.h b/sw/snRuntime/src/alloc_v2.h index 29ffb81e52..db6a2916bf 100644 --- a/sw/snRuntime/src/alloc_v2.h +++ b/sw/snRuntime/src/alloc_v2.h @@ -2,29 +2,61 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +/** + * @file + * @brief Defines functions to dynamically allocate the cluster's L1 memory. + * + * This file provides functions to dynamically allocate the cluster's L1 + * memory. It includes functions for allocating memory for cluster-local + * variables, compute core-local variables, and for manipulating pointers to + * variables allocated by different cores or clusters. + */ + extern __thread snrt_allocator_t l1_allocator_v2; +/** + * @brief Get a pointer to the L1 allocator. + * + * @return Pointer to the L1 allocator. + */ inline snrt_allocator_t *snrt_l1_allocator_v2() { return &l1_allocator_v2; } +/** + * @brief Get the next pointer of the L1 allocator. + * + * @return The next pointer of the L1 allocator. + */ inline void *snrt_l1_next_v2() { return (void *)snrt_l1_allocator_v2()->next; } /** - * @brief Override the L1 allocator next pointer + * @brief Override the L1 allocator next pointer. + * + * @param next The new value for the next pointer. */ inline void snrt_l1_update_next_v2(void *next) { snrt_l1_allocator_v2()->next = (uint32_t)next; } -// Check that allocation doesn't exceed allocator bounds, and raise an -// exception otherwise +/** + * @brief Check if the allocation exceeds the allocator bounds and raise an + * exception if it does. + */ inline void snrt_l1_alloc_check_bounds() { if (snrt_l1_allocator_v2()->next > snrt_l1_allocator_v2()->end) asm volatile("ecall \n"); } -// Dynamically allocate space for a variable of size `size` in the cluster's L1 -// memory. This function should be invoked by every core in a cluster. Every -// core receives a pointer to the allocated variable. +/** + * @brief Allocate space for a variable in the cluster's L1 memory. + * + * This function dynamically allocates space for a variable of size `size` in + * the cluster's L1 memory. + * The allocation is aligned to the specified `alignment`. + * + * @param size The size of the variable to allocate. + * @param alignment The alignment of the allocation. + * @return Pointer to the allocated variable. + */ inline void *snrt_l1_alloc_cluster_local(size_t size, const size_t alignment) { snrt_l1_allocator_v2()->next = ALIGN_UP(snrt_l1_allocator_v2()->next, alignment); @@ -34,11 +66,19 @@ inline void *snrt_l1_alloc_cluster_local(size_t size, const size_t alignment) { return retval; } -// Dynamically allocate space for N variables of size `size` in the cluster's -// L1 memory, N being the number of compute cores in the cluster. This function -// should be invoked by every core in a cluster. Every compute core receives a -// pointer to a unique variable among the N which have been allocated. The -// return value for the DM core is undefined. +/** + * @brief Allocate space for N variables in the cluster's L1 memory. + * + * This function dynamically allocates space for N variables of size `size` in + * the cluster's L1 memory, where N is the number of compute cores in the + * cluster. The variables are allocated in a contiguous block of memory. + * The whole block is aligned to the specified `alignment`. + * + * @param size The size of each variable to allocate. + * @param alignment The alignment of the allocation. + * @return Pointer to the allocated variable for each compute core. 
+ * The return value for the DM core is undefined. + */ inline void *snrt_l1_alloc_compute_core_local(size_t size, const size_t alignment) { snrt_l1_allocator_v2()->next = @@ -49,24 +89,52 @@ inline void *snrt_l1_alloc_compute_core_local(size_t size, return retval; } -// Takes a pointer to a variable allocated using -// `snrt_l1_alloc_compute_core_local` and returns a pointer to the same -// variable allocated by another core, as specified by `core_idx`. -// The `size` argument should be the same used during allocation. +/** + * @brief Get a pointer to the same variable allocated by another core. + * + * This function takes a pointer to a variable allocated using + * `snrt_l1_alloc_compute_core_local` and returns a pointer to the same + * variable allocated by another core, as specified by `core_idx`. + * The `size` argument should be the same used during allocation. + * + * @param ptr Pointer to the variable allocated by the current core. + * @param core_idx Index of the core that allocated the variable. + * @param size The size of the variable. + * @return Pointer to the same variable allocated by the specified core. + */ inline void *snrt_compute_core_local_ptr(void *ptr, uint32_t core_idx, size_t size) { size_t offset = (core_idx - snrt_cluster_core_idx()) * size; return (void *)((uintptr_t)ptr + offset); } -// Takes a pointer to a variable in the source cluster's L1 memory and returns -// a pointer to the same offset in the destination cluster's L1 memory. +/** + * @brief Get a pointer to the same offset in another cluster's L1 memory. + * + * This function takes a pointer to a variable in the calling (source) + * cluster's L1 memory and returns a pointer to the same offset in the target + * (destination) cluster's L1 memory. + * + * @param ptr Pointer to the variable in the source cluster's L1 memory. + * @param src_cluster_idx Index of the source cluster. + * @param dst_cluster_idx Index of the destination cluster. + * @return Pointer to the same offset in the destination cluster's L1 memory. + */ inline void *snrt_remote_l1_ptr(void *ptr, uint32_t src_cluster_idx, uint32_t dst_cluster_idx) { return (void *)((uintptr_t)ptr + (dst_cluster_idx - src_cluster_idx) * SNRT_CLUSTER_OFFSET); } +/** + * @brief Initialize the L1 allocator. + * + * This function initializes the L1 allocator by calculating the end address + * of the heap and setting the base, end, and next pointers of the allocator. + * + * @note This function should be called before using any of the allocation + * functions. + */ inline void snrt_alloc_init_v2() { // Calculate end address of the heap. The top of the TCDM address space is // reserved for the cluster-local storage (CLS) and the stack of every diff --git a/sw/snRuntime/src/dma.h b/sw/snRuntime/src/dma.h index a4b82a7d32..e8147be7d0 100644 --- a/sw/snRuntime/src/dma.h +++ b/sw/snRuntime/src/dma.h @@ -2,6 +2,11 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +/** + * @file + * @brief This file provides functions to program the Snitch DMA. + */ + #pragma once #define OP_CUSTOM1 0b0101011 @@ -19,9 +24,17 @@ ((funct7 << 25) | (rs2 << 20) | (rs1 << 15) | (funct3 << 12) | (rd << 7) | \ (opcode)) + /// A DMA transfer identifier. typedef uint32_t snrt_dma_txid_t; +/** + * @brief Start an asynchronous 1D DMA transfer with 64-bit wide pointers. + * @param dst The destination address. + * @param src The source address. + * @param size The size of the transfer in bytes. + * @return The DMA transfer ID. 
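+ *
+ * A minimal usage sketch (the addresses and size below are illustrative
+ * placeholders, not part of the API):
+ * @code{.c}
+ * // Copy 256 bytes between two 64-bit addresses, then wait for completion.
+ * snrt_dma_txid_t txid = snrt_dma_start_1d_wideptr(dst_addr, src_addr, 256);
+ * snrt_dma_wait(txid);
+ * @endcode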
+ */ inline uint32_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src, size_t size) { register uint32_t reg_dst_low asm("a0") = dst >> 0; // 10 @@ -51,13 +64,30 @@ inline uint32_t snrt_dma_start_1d_wideptr(uint64_t dst, uint64_t src, return reg_txid; } -/// Initiate an asynchronous 1D DMA transfer. +/** + * @brief Start an asynchronous 1D DMA transfer with native-size pointers. + * @param dst The destination pointer. + * @param src The source pointer. + * @param size The size of the transfer in bytes. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_1d(void *dst, const void *src, size_t size) { return snrt_dma_start_1d_wideptr((size_t)dst, (size_t)src, size); } -/// Initiate an asynchronous 2D DMA transfer with wide 64-bit pointers. +/** + * @brief Start an asynchronous 2D DMA transfer with 64-bit wide pointers. + * @param dst The destination address. + * @param src The source address. + * @param size The size of every 1D transfer within the 2D transfer in bytes. + * @param dst_stride The offset between consecutive 1D transfers at the + * destination, in bytes. + * @param src_stride The offset between consecutive 1D transfers at the + * source, in bytes. + * @param repeat The number of 1D transfers composing the 2D transfer. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_2d_wideptr(uint64_t dst, uint64_t src, size_t size, size_t dst_stride, size_t src_stride, @@ -102,7 +132,18 @@ inline snrt_dma_txid_t snrt_dma_start_2d_wideptr(uint64_t dst, uint64_t src, return reg_txid; } -/// Initiate an asynchronous 2D DMA transfer. +/** + * @brief Start an asynchronous 2D DMA transfer with native-size pointers. + * @param dst The destination address. + * @param src The source address. + * @param size The size of every 1D transfer within the 2D transfer in bytes. + * @param dst_stride The offset between consecutive 1D transfers at the + * destination, in bytes. + * @param src_stride The offset between consecutive 1D transfers at the + * source, in bytes. + * @param repeat The number of 1D transfers composing the 2D transfer. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_2d(void *dst, const void *src, size_t size, size_t dst_stride, size_t src_stride, size_t repeat) { @@ -110,8 +151,15 @@ inline snrt_dma_txid_t snrt_dma_start_2d(void *dst, const void *src, src_stride, repeat); } -/// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers and a -/// specific channel. +/** + * @brief Start an asynchronous 1D DMA transfer with 64-bit wide pointers on a + * specific channel. + * @param dst The destination address. + * @param src The source address. + * @param size The size of the transfer in bytes. + * @param channel The index of the channel. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_1d_channel_wideptr(uint64_t dst, uint64_t src, size_t size, @@ -144,7 +192,15 @@ inline snrt_dma_txid_t snrt_dma_start_1d_channel_wideptr(uint64_t dst, return reg_txid; } -/// Initiate an asynchronous 1D DMA transfer and a specific channel. +/** + * @brief Start an asynchronous 1D DMA transfer with native-size pointers on a + * specific channel. + * @param dst The destination pointer. + * @param src The source pointer. + * @param size The size of the transfer in bytes. + * @param channel The index of the channel. + * @return The DMA transfer ID. 
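+ *
+ * Illustrative sketch (the buffer pointers, size and channel index are
+ * example values):
+ * @code{.c}
+ * // Issue the copy on DMA channel 0 and wait for that specific transfer.
+ * snrt_dma_txid_t txid = snrt_dma_start_1d_channel(dst, src, size, 0);
+ * snrt_dma_wait_channel(txid, 0);
+ * @endcode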
+ */ inline snrt_dma_txid_t snrt_dma_start_1d_channel(void *dst, const void *src, size_t size, uint32_t channel) { @@ -152,8 +208,20 @@ inline snrt_dma_txid_t snrt_dma_start_1d_channel(void *dst, const void *src, channel); } -/// Initiate an asynchronous 1D DMA transfer with wide 64-bit pointers and a -/// specific channel. +/** + * @brief Start an asynchronous 2D DMA transfer with 64-bit wide pointers on a + * specific channel. + * @param dst The destination address. + * @param src The source address. + * @param size The size of every 1D transfer within the 2D transfer in bytes. + * @param dst_stride The offset between consecutive 1D transfers at the + * destination, in bytes. + * @param src_stride The offset between consecutive 1D transfers at the + * source, in bytes. + * @param repeat The number of 1D transfers composing the 2D transfer. + * @param channel The index of the channel. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_2d_channel_wideptr( uint64_t dst, uint64_t src, size_t size, size_t dst_stride, size_t src_stride, size_t repeat, uint32_t channel) { @@ -198,7 +266,20 @@ inline snrt_dma_txid_t snrt_dma_start_2d_channel_wideptr( return reg_txid; } -/// Initiate an asynchronous 2D DMA transfer and a specific channel. +/** + * @brief Start an asynchronous 2D DMA transfer with native-size pointers on a + * specific channel. + * @param dst The destination address. + * @param src The source address. + * @param size The size of every 1D transfer within the 2D transfer in bytes. + * @param dst_stride The offset between consecutive 1D transfers at the + * destination, in bytes. + * @param src_stride The offset between consecutive 1D transfers at the + * source, in bytes. + * @param repeat The number of 1D transfers composing the 2D transfer. + * @param channel The index of the channel. + * @return The DMA transfer ID. + */ inline snrt_dma_txid_t snrt_dma_start_2d_channel(void *dst, const void *src, size_t size, size_t dst_stride, size_t src_stride, @@ -209,7 +290,10 @@ inline snrt_dma_txid_t snrt_dma_start_2d_channel(void *dst, const void *src, channel); } -/// Block until a transfer finishes. +/** + * @brief Block until a DMA transfer finishes. + * @param tid The DMA transfer ID. + */ inline void snrt_dma_wait(snrt_dma_txid_t tid) { // dmstati t0, 0 # 0=status.completed_id asm volatile( @@ -221,7 +305,10 @@ inline void snrt_dma_wait(snrt_dma_txid_t tid) { : : [ tid ] "r"(tid) : "t0"); } -/// Block until a transfer finishes on a specific channel. +/** + * @brief Block until a DMA transfer finishes on a specific channel. + * @param tid The DMA transfer ID. + */ inline void snrt_dma_wait_channel(snrt_dma_txid_t tid, uint32_t channel) { // dmstati t0, 0 # 0=status.completed_id register uint32_t cfg asm("t1") = channel << 2; @@ -235,7 +322,9 @@ inline void snrt_dma_wait_channel(snrt_dma_txid_t tid, uint32_t channel) { : "t0"); } -/// Block until all operation on the DMA ceases. +/** + * @brief Block until all DMA operations cease. + */ inline void snrt_dma_wait_all() { // dmstati t0, 2 # 2=status.busy asm volatile( @@ -246,7 +335,10 @@ inline void snrt_dma_wait_all() { : "t0"); } -/// Block until all operation on the DMA ceases on a specific channel. +/** + * @brief Block until a specific DMA channel is idle. + * @param channel The index of the channel.
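+ *
+ * Example sketch (the channel index and transfer arguments are placeholders):
+ * @code{.c}
+ * // Issue one or more transfers on channel 1, then drain that channel only.
+ * snrt_dma_start_1d_channel(dst, src, size, 1);
+ * snrt_dma_wait_all_channel(1);
+ * @endcode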
+ */ inline void snrt_dma_wait_all_channel(uint32_t channel) { register uint32_t tmp; // dmstati t0, 2 # 2=status.busy @@ -260,7 +352,10 @@ inline void snrt_dma_wait_all_channel(uint32_t channel) { : "t0"); } -/// Wait until all channels are idle +/** + * @brief Block until the first @p num_channels channels are idle. + * @param num_channels The number of channels to wait on. + */ inline void snrt_dma_wait_all_channels(uint32_t num_channels) { register uint32_t tmp; // dmstati t0, 2 # 2=status.busy @@ -270,10 +365,10 @@ inline void snrt_dma_wait_all_channels(uint32_t num_channels) { } /** - * @brief start tracking of dma performance region. Does not have any + * @brief Start tracking of dma performance region. Does not have any * implications on the HW. Only injects a marker in the DMA traces that can be - * analyzed - * + * analyzed. + * @deprecated */ inline void snrt_dma_start_tracking() { // dmstati zero, 0 @@ -282,10 +377,10 @@ inline void snrt_dma_start_tracking() { } /** - * @brief stop tracking of dma performance region. Does not have any + * @brief Stop tracking of dma performance region. Does not have any * implications on the HW. Only injects a marker in the DMA traces that can be - * analyzed - * + * analyzed. + * @deprecated */ inline void snrt_dma_stop_tracking() { asm volatile(".word %0\n" ::"i"( @@ -293,11 +388,10 @@ inline void snrt_dma_stop_tracking() { } /** - * @brief fast memset function performed by DMA - * - * @param ptr pointer to the start of the region - * @param value value to set - * @param len number of bytes, must be multiple of DMA bus-width + * @brief Fast memset function performed by DMA. + * @param ptr Pointer to the start of the region. + * @param value Value to set. + * @param len Number of bytes, must be a multiple of the DMA bus width. */ inline void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len) { // set first 64bytes to value @@ -314,9 +408,14 @@ inline void snrt_dma_memset(void *ptr, uint8_t value, uint32_t len) { snrt_dma_wait_all(); } -/// Load a 1D-tile of size tile_size from a 1D array. The specific tile is -/// selected by tile_idx. Every element in the src and dst arrays has prec -/// bytes. +/** + * @brief Load a tile of a 1D array. + * @param dst Pointer to the tile destination. + * @param src Pointer to the source array. + * @param tile_idx Index of the tile in the 1D array. + * @param tile_size Number of elements within a tile of the 1D array. + * @param prec Number of bytes of each element in the 1D array. + */ inline snrt_dma_txid_t snrt_dma_load_1d_tile(void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec) { @@ -324,9 +423,14 @@ inline snrt_dma_txid_t snrt_dma_load_1d_tile(void *dst, void *src, return snrt_dma_start_1d(dst, src + tile_idx * tile_nbytes, tile_nbytes); } -/// Store a 1D-tile of size tile_size to a 1D array. The specific tile is -/// selected by tile_idx. Every element in the src and dst arrays has prec -/// bytes. +/** + * @brief Store a tile to a 1D array. + * @param dst Pointer to the destination array. + * @param src Pointer to the source tile. + * @param tile_idx Index of the tile in the 1D array. + * @param tile_size Number of elements within a tile of the 1D array. + * @param prec Number of bytes of each element in the 1D array. 
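+ *
+ * Illustrative sketch (the array and tile pointers, tile index and tile size
+ * are assumptions made for the example):
+ * @code{.c}
+ * // Write tile `i` of `tile_size` doubles from L1 back to the full array.
+ * snrt_dma_store_1d_tile(dram_array, l1_tile, i, tile_size, sizeof(double));
+ * snrt_dma_wait_all();
+ * @endcode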
+ */ inline snrt_dma_txid_t snrt_dma_store_1d_tile(void *dst, void *src, size_t tile_idx, size_t tile_size, uint32_t prec) { @@ -334,10 +438,20 @@ inline snrt_dma_txid_t snrt_dma_store_1d_tile(void *dst, void *src, return snrt_dma_start_1d(dst + tile_idx * tile_nbytes, src, tile_nbytes); } -/// Load a 2D-tile of shape (tile_x1_size, tile_x0_size) from the 2D array -/// of shape (full_x1_size, full_x0_size). The specific tile is selected -/// by the (tile_x1_idx, tile_x0_idx) tuple. Every element in the src and -/// destination arrays has prec bytes. +/** + * @brief Load a 2D tile of a 2D array. + * @param dst Pointer to the tile destination. + * @param src Pointer to the source array. + * @param tile_x1_idx Outermost coordinate of the tile in the 2D array. + * @param tile_x0_idx Innermost coordinate of the tile in the 2D array. + * @param tile_x1_size Number of elements in the outermost dimension of the + * tile. + * @param tile_x0_size Number of elements in the innermost dimension of the + * tile. + * @param full_x0_size Number of elements in the innermost dimension of the + * array. + * @param prec Number of bytes of each element in the 2D array. + */ inline snrt_dma_txid_t snrt_dma_load_2d_tile( void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, @@ -357,10 +471,20 @@ inline snrt_dma_txid_t snrt_dma_load_2d_tile( ); } -/// Store a 2D-tile of shape (tile_x1_size, tile_x0_size) to the 2D array -/// of shape (full_x1_size, full_x0_size). The specific tile is selected -/// by the (tile_x1_idx, tile_x0_idx) tuple. Every element in the src and -/// destination arrays has prec bytes. +/** + * @brief Store a 2D tile to a 2D array. + * @param dst Pointer to the destination array. + * @param src Pointer to the source tile. + * @param tile_x1_idx Outermost coordinate of the tile in the 2D array. + * @param tile_x0_idx Innermost coordinate of the tile in the 2D array. + * @param tile_x1_size Number of elements in the outermost dimension of the + * tile. + * @param tile_x0_size Number of elements in the innermost dimension of the + * tile. + * @param full_x0_size Number of elements in the innermost dimension of the + * array. + * @param prec Number of bytes of each element in the 2D array. + */ inline snrt_dma_txid_t snrt_dma_store_2d_tile( void *dst, void *src, size_t tile_x1_idx, size_t tile_x0_idx, size_t tile_x1_size, size_t tile_x0_size, size_t full_x0_size, diff --git a/sw/snRuntime/src/ssr.h b/sw/snRuntime/src/ssr.h index 1a067fea5c..f932fd2ab5 100644 --- a/sw/snRuntime/src/ssr.h +++ b/sw/snRuntime/src/ssr.h @@ -2,9 +2,27 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +/** + * @file + * @brief This file contains functions to conveniently program Snitch's SSRs. + * + * An SSR stream can be configured to replace a store (or load) sequence as + * could be generated by an N-dimensional affine loop nest: + * @code{.c} + * for (int i = 0; i < b1; i++) + * for (int j = 0; j < b0; j++) + * array[i * s1 + j * s0] = 0; + * @endcode + * + * The configuration functions provided in this file reflect the parameters + * one would define to set up such a loop nest. + */ + #pragma once -/// Synchronize the integer and float pipelines. +/** + * @brief Synchronize the integer and float pipelines. + */ inline void snrt_fpu_fence() { unsigned tmp; asm volatile( @@ -13,34 +31,41 @@ inline void snrt_fpu_fence() { : "+r"(tmp)::"memory"); } -/// The different SSR data movers. 
+/** + * @brief The different SSRs. + */ enum snrt_ssr_dm { - SNRT_SSR_DM0 = 0, - SNRT_SSR_DM1 = 1, - SNRT_SSR_DM2 = 2, - // To write to all SSRs, use index 31 - SNRT_SSR_DM_ALL = 31, + SNRT_SSR_DM0 = 0, /**< SSR data mover 0 */ + SNRT_SSR_DM1 = 1, /**< SSR data mover 1 */ + SNRT_SSR_DM2 = 2, /**< SSR data mover 2 */ + SNRT_SSR_DM_ALL = 31 /**< Write to all SSRs */ }; -/// The different dimensions. +/** + * @brief The different dimensions. + */ enum snrt_ssr_dim { - SNRT_SSR_1D = 0, - SNRT_SSR_2D = 1, - SNRT_SSR_3D = 2, - SNRT_SSR_4D = 3, + SNRT_SSR_1D = 0, /**< 1D stream */ + SNRT_SSR_2D = 1, /**< 2D stream */ + SNRT_SSR_3D = 2, /**< 3D stream */ + SNRT_SSR_4D = 3 /**< 4D stream */ }; -/// The SSR configuration registers. +/** + * @brief The SSR configuration registers. + */ enum { - REG_STATUS = 0, - REG_REPEAT = 1, - REG_BOUNDS = 2, // + loop index - REG_STRIDES = 6, // + loop index - REG_RPTR = 24, // + snrt_ssr_dim - REG_WPTR = 28, // + snrt_ssr_dim + REG_STATUS = 0, /**< SSR status register */ + REG_REPEAT = 1, /**< SSR repeat register */ + REG_BOUNDS = 2, /**< SSR bounds register */ + REG_STRIDES = 6, /**< SSR strides register */ + REG_RPTR = 24, /**< SSR read pointer register */ + REG_WPTR = 28 /**< SSR write pointer register */ }; -/// Enable SSR. +/** + * @brief Enable all SSRs. + */ inline void snrt_ssr_enable() { #ifdef __TOOLCHAIN_LLVM__ __builtin_ssr_enable(); @@ -49,7 +74,9 @@ inline void snrt_ssr_enable() { #endif } -/// Disable SSR. +/** + * @brief Disable all SSRs. + */ inline void snrt_ssr_disable() { #ifdef __TOOLCHAIN_LLVM__ __builtin_ssr_disable(); @@ -58,20 +85,36 @@ inline void snrt_ssr_disable() { #endif } +/** + * @brief Read the value of an SSR configuration register. + * @param reg The register index. + * @param dm The SSR index. + * @return The value of the register. + */ inline uint32_t read_ssr_cfg(uint32_t reg, uint32_t dm) { + uint32_t value; asm volatile("scfgri %[value], %[dm] | %[reg]<<5\n" : [ value ] "=r"(value) : [ dm ] "i"(dm), [ reg ] "i"(reg)); return value; } - +/** + * @brief Write a value to an SSR configuration register. + * @param reg The register index. + * @param dm The SSR index. + * @param value The value to write. + */ inline void write_ssr_cfg(uint32_t reg, uint32_t dm, uint32_t value) { asm volatile("scfgwi %[value], %[dm] | %[reg]<<5\n" ::[value] "r"(value), [ dm ] "i"(dm), [ reg ] "i"(reg)); } - -// Configure an SSR data mover for a 1D loop nest. +/** + * @brief Configure an SSR data mover for a 1D loop nest. + * @param dm The SSR index. + * @param b0 The bound of the loop. + * @param s0 The stride of the loop. + */ inline void snrt_ssr_loop_1d(enum snrt_ssr_dm dm, size_t b0, size_t s0) { --b0; write_ssr_cfg(REG_BOUNDS + 0, dm, b0); @@ -79,8 +122,14 @@ inline void snrt_ssr_loop_1d(enum snrt_ssr_dm dm, size_t b0, size_t s0) { write_ssr_cfg(REG_STRIDES + 0, dm, s0 - a); a += s0 * b0; } - -// Configure an SSR data mover for a 2D loop nest. +/** + * @brief Configure an SSR data mover for a 2D loop nest. + * @param dm The SSR index. + * @param b0 The bound of the first loop. + * @param b1 The bound of the second loop. + * @param s0 The stride of the first loop. + * @param s1 The stride of the second loop. + */ inline void snrt_ssr_loop_2d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t s0, size_t s1) { --b0; @@ -94,7 +143,16 @@ inline void snrt_ssr_loop_2d(enum snrt_ssr_dm dm, size_t b0, size_t b1, a += s1 * b1; } -// Configure an SSR data mover for a 3D loop nest. +/** + * @brief Configure an SSR data mover for a 3D loop nest. 
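+ *
+ * Analogously to the file-level example, the configured stream replaces the
+ * accesses of a loop nest of the form below, with loop 0 (b0, s0) innermost
+ * (illustrative sketch):
+ * @code{.c}
+ * for (int i = 0; i < b2; i++)
+ *     for (int j = 0; j < b1; j++)
+ *         for (int k = 0; k < b0; k++)
+ *             array[i * s2 + j * s1 + k * s0] = 0;
+ * @endcode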
+ * @param dm The SSR index. + * @param b0 The bound of the first loop. + * @param b1 The bound of the second loop. + * @param b2 The bound of the third loop. + * @param s0 The stride of the first loop. + * @param s1 The stride of the second loop. + * @param s2 The stride of the third loop. + */ inline void snrt_ssr_loop_3d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t b2, size_t s0, size_t s1, size_t s2) { --b0; @@ -112,10 +170,18 @@ inline void snrt_ssr_loop_3d(enum snrt_ssr_dm dm, size_t b0, size_t b1, a += s2 * b2; } -// Configure an SSR data mover for a 4D loop nest. -// b0: Inner-most bound (limit of loop) -// b3: Outer-most bound (limit of loop) -// s0: increment size of inner-most loop +/** + * @brief Configure an SSR data mover for a 4D loop nest. + * @param dm The SSR index. + * @param b0 The bound of the first loop. + * @param b1 The bound of the second loop. + * @param b2 The bound of the third loop. + * @param b3 The bound of the fourth loop. + * @param s0 The stride of the first loop. + * @param s1 The stride of the second loop. + * @param s2 The stride of the third loop. + * @param s3 The stride of the fourth loop. + */ inline void snrt_ssr_loop_4d(enum snrt_ssr_dm dm, size_t b0, size_t b1, size_t b2, size_t b3, size_t s0, size_t s1, size_t s2, size_t s3) { @@ -137,19 +203,30 @@ inline void snrt_ssr_loop_4d(enum snrt_ssr_dm dm, size_t b0, size_t b1, write_ssr_cfg(REG_STRIDES + 3, dm, s3 - a); a += s3 * b3; } - -/// Configure the repetition count for a stream. +/** + * @brief Configure the repetition count for a stream. + * @param dm The SSR index. + * @param count The repetition count. + */ inline void snrt_ssr_repeat(enum snrt_ssr_dm dm, size_t count) { write_ssr_cfg(REG_REPEAT, dm, count - 1); } - -/// Start a streaming read. +/** + * @brief Start a streaming read. + * @param dm The SSR index. + * @param dim The number of dimensions to use. + * @param ptr The pointer to the data. + */ inline void snrt_ssr_read(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim, volatile void *ptr) { write_ssr_cfg(REG_RPTR + dim, dm, (uintptr_t)ptr); } - -/// Start a streaming write. +/** + * @brief Start a streaming write. + * @param dm The SSR index. + * @param dim The number of dimensions to use. + * @param ptr The pointer to the data. + */ inline void snrt_ssr_write(enum snrt_ssr_dm dm, enum snrt_ssr_dim dim, volatile void *ptr) { write_ssr_cfg(REG_WPTR + dim, dm, (uintptr_t)ptr); diff --git a/sw/snRuntime/src/sync.h b/sw/snRuntime/src/sync.h index fa4b75b244..add26fa082 100644 --- a/sw/snRuntime/src/sync.h +++ b/sw/snRuntime/src/sync.h @@ -5,6 +5,11 @@ // Luca Colagrande // Viviane Potocnik +/** + * @file + * @brief This file provides functions to synchronize Snitch cores. + */ + #pragma once #include @@ -13,11 +18,18 @@ // Mutex functions //================================================================================ +/** + * @brief Get a pointer to a mutex variable. + */ inline volatile uint32_t *snrt_mutex() { return &_snrt_mutex; } /** - * @brief lock a mutex, blocking - * @details declare mutex with `static volatile uint32_t mtx = 0;` + * @brief Acquire a mutex, blocking. + * @details Test-and-set (TAS) implementation of a lock. + * @param pmtx A pointer to a variable which can be used as a mutex, i.e. to + * which all cores have a reference and at a memory location to + * which atomic accesses can be made. This can be declared e.g. as + * `static volatile uint32_t mtx = 0;`. 
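+ *
+ * Typical usage sketch (the mutex variable and critical section are
+ * illustrative):
+ * @code{.c}
+ * static volatile uint32_t mtx = 0;
+ * snrt_mutex_acquire(&mtx);
+ * // ... critical section ...
+ * snrt_mutex_release(&mtx);
+ * @endcode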
*/ inline void snrt_mutex_acquire(volatile uint32_t *pmtx) { asm volatile( @@ -31,9 +43,9 @@ inline void snrt_mutex_acquire(volatile uint32_t *pmtx) { } /** - * @brief lock a mutex, blocking - * @details test and test-and-set (ttas) implementation of a lock. - * Declare mutex with `static volatile uint32_t mtx = 0;` + * @brief Acquire a mutex, blocking. + * @details Same as @ref snrt_mutex_acquire but acquires the lock using a test + * and test-and-set (TTAS) strategy. */ inline void snrt_mutex_ttas_acquire(volatile uint32_t *pmtx) { asm volatile( @@ -50,7 +62,7 @@ inline void snrt_mutex_ttas_acquire(volatile uint32_t *pmtx) { } /** - * @brief Release the mutex + * @brief Release a previously-acquired mutex. */ inline void snrt_mutex_release(volatile uint32_t *pmtx) { asm volatile("amoswap.w.rl x0,x0,(%0) # Release lock by storing 0\n" @@ -61,13 +73,21 @@ inline void snrt_mutex_release(volatile uint32_t *pmtx) { // Barrier functions //================================================================================ -/// Synchronize cores in a cluster with a hardware barrier +/** + * @brief Synchronize cores in a cluster with a hardware barrier, blocking. + * @note Synchronizes all (both DM and compute) cores. All cores must invoke + * this function, or the calling cores will stall indefinitely. + */ inline void snrt_cluster_hw_barrier() { asm volatile("csrr x0, 0x7C2" ::: "memory"); } -// Synchronizes one core from every cluster with the others. -// One core per cluster is expected to invoke this function. +/** + * @brief Synchronize one core from every cluster with the others. + * @details Implemented as a software barrier. + * @note One core per cluster must invoke this function, or the calling cores + * will stall indefinitely. + */ inline void snrt_inter_cluster_barrier() { // Remember previous iteration uint32_t prev_barrier_iteration = _snrt_barrier.iteration; @@ -84,7 +104,15 @@ inline void snrt_inter_cluster_barrier() { } } -/// Synchronize clusters globally with a global software barrier +/** + * @brief Synchronize all Snitch cores. + * @details Synchronization is performed hierarchically. Within a cluster, + * cores are synchronized through a hardware barrier (see + * @ref snrt_cluster_hw_barrier). Clusters are synchronized through + * a software barrier (see @ref snrt_inter_cluster_barrier). + * @note Every Snitch core must invoke this function, or the calling cores + * will stall indefinitely. + */ inline void snrt_global_barrier() { snrt_cluster_hw_barrier(); @@ -96,17 +124,12 @@ inline void snrt_global_barrier() { snrt_cluster_hw_barrier(); } -inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) { - __atomic_add_fetch(&_reduction_result, value, __ATOMIC_RELAXED); - snrt_global_barrier(); - return _reduction_result; -} - /** - * @brief Generic barrier - * - * @param barr pointer to a barrier - * @param n number of harts that have to enter before released + * @brief Generic software barrier. + * @param barr pointer to a barrier variable. + * @param n number of harts that have to enter before released. + * @note Exactly the specified number of harts must invoke this function, or + * the calling cores will stall indefinitely. 
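+ *
+ * Usage sketch: synchronize only the compute cores, assuming `barr` points to
+ * a zero-initialized snrt_barrier_t visible to all participating harts
+ * (allocation not shown here):
+ * @code{.c}
+ * if (snrt_is_compute_core())
+ *     snrt_partial_barrier(barr, snrt_global_compute_core_num());
+ * @endcode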
+ */ inline void snrt_partial_barrier(snrt_barrier_t *barr, uint32_t n) { // Remember previous iteration @@ -128,8 +151,37 @@ // Reduction functions //================================================================================ -// Assumes the dst and src buffers are at the same offset in the TCDM of every -// cluster +/** + * @brief Perform a global sum reduction, blocking. + * @details All cores participate in the reduction and synchronize globally + * to wait for the reduction to complete. + * The synchronization is performed via @ref snrt_global_barrier. + * @param value The value to be summed. + * @return The result of the sum reduction. + * @note Every Snitch core must invoke this function, or the calling cores + * will stall indefinitely. + */ +inline uint32_t snrt_global_all_to_all_reduction(uint32_t value) { __atomic_add_fetch(&_reduction_result, value, __ATOMIC_RELAXED); snrt_global_barrier(); return _reduction_result; } + +/** + * @brief Perform a sum reduction among clusters, blocking. + * @details The reduction is performed in a logarithmic fashion. Half of the + * clusters active in every level of the binary tree participate as + * senders, the other half as receivers. Senders use the DMA to + * send their data to the respective receiver's destination buffer. + * The receiver then reduces each element in its destination buffer + * with the respective element in its source buffer. It then proceeds + * to the next level in the binary tree. + * @param dst_buffer The pointer to the calling cluster's destination buffer. + * @param src_buffer The pointer to the calling cluster's source buffer. + * @param len The amount of data in each buffer. + * @note The destination buffers must lie at the same offset in every cluster's + * TCDM. + */ inline void snrt_global_reduction_dma(double *dst_buffer, double *src_buffer, size_t len) { // If we have a single cluster the reduction degenerates to a memcpy diff --git a/sw/snRuntime/src/team.h b/sw/snRuntime/src/team.h index eb06a4488a..ebdaf34d55 100644 --- a/sw/snRuntime/src/team.h +++ b/sw/snRuntime/src/team.h @@ -2,67 +2,151 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +/** + * @file + * @brief This file contains functions and macros related to Snitch team management. + * + * The functions in this file provide information about the Snitch hardware configuration, + * such as the number of clusters, cores per cluster, and the current core's index within + * the system. These functions can be used for team management and core-specific operations. + */ + #pragma once +/** + * @brief Get the RISC-V hardware thread ID (hartid). + * + * @return The hardware thread ID. + */ inline uint32_t __attribute__((const)) snrt_hartid() { uint32_t hartid; asm("csrr %0, mhartid" : "=r"(hartid)); return hartid; } +/** + * @brief Get the number of Snitch clusters in the system. + * + * @return The number of clusters. + */ inline uint32_t __attribute__((const)) snrt_cluster_num() { return SNRT_CLUSTER_NUM; } +/** + * @brief Get the number of cores per cluster. + * + * @return The number of cores per cluster. + */ inline uint32_t __attribute__((const)) snrt_cluster_core_num() { return SNRT_CLUSTER_CORE_NUM; } +/** + * @brief Get the hartid of the first Snitch core in the system. + * + * @return The hartid of the first Snitch core in the system.
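+ *
+ * For example, a core's system-wide index (see snrt_global_core_idx())
+ * relates to its hartid as follows:
+ * @code{.c}
+ * uint32_t global_core_idx = snrt_hartid() - snrt_global_core_base_hartid();
+ * @endcode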
+ */ inline uint32_t __attribute__((const)) snrt_global_core_base_hartid() { return SNRT_BASE_HARTID; } +/** + * @brief Get the total number of Snitch cores in the system. + * + * @return The total number of cores. + */ inline uint32_t __attribute__((const)) snrt_global_core_num() { return snrt_cluster_num() * snrt_cluster_core_num(); } +/** + * @brief Get the total number of Snitch compute cores in the system. + * + * @return The total number of compute cores. + */ inline uint32_t __attribute__((const)) snrt_global_compute_core_num() { return snrt_cluster_num() * snrt_cluster_compute_core_num(); } +/** + * @brief Get the index (!= hartid) of the current Snitch core in the system. + * + * @return The index of the current Snitch core. + */ inline uint32_t __attribute__((const)) snrt_global_core_idx() { return snrt_hartid() - snrt_global_core_base_hartid(); } +/** + * @brief Get the index of the current Snitch compute core in the system. + * + * @return The index of the current Snitch compute core. + */ inline uint32_t __attribute__((const)) snrt_global_compute_core_idx() { return snrt_cluster_idx() * snrt_cluster_compute_core_num() + snrt_cluster_core_idx(); } +/** + * @brief Get the index of the current Snitch cluster in the system. + * + * @return The index of the current cluster. + */ inline uint32_t __attribute__((const)) snrt_cluster_idx() { return snrt_global_core_idx() / snrt_cluster_core_num(); } +/** + * @brief Get the index of the current Snitch core within the cluster. + * + * @return The index of the current core within the cluster. + */ inline uint32_t __attribute__((const)) snrt_cluster_core_idx() { return snrt_global_core_idx() % snrt_cluster_core_num(); } +/** + * @brief Get the number of data mover (DM) cores per cluster. + * + * @return The number of DM cores per cluster. + */ inline uint32_t __attribute__((const)) snrt_cluster_dm_core_num() { return SNRT_CLUSTER_DM_CORE_NUM; } +/** + * @brief Get the number of compute cores per cluster. + * + * @return The number of compute cores per cluster. + */ inline uint32_t __attribute__((const)) snrt_cluster_compute_core_num() { return snrt_cluster_core_num() - snrt_cluster_dm_core_num(); } +/** + * @brief Check if the current core is a compute core. + * + * @return True if the current core is a compute core, false otherwise. + */ inline int __attribute__((const)) snrt_is_compute_core() { return snrt_cluster_core_idx() < snrt_cluster_compute_core_num(); } +/** + * @brief Check if the current core is the last compute core in the cluster. + * + * @return True if the current core is the last compute core, false otherwise. + */ inline int __attribute__((const)) snrt_cluster_is_last_compute_core() { return snrt_cluster_core_idx() == (snrt_cluster_compute_core_num() - 1); } +/** + * @brief Check if the current core is a data mover (DM) core. + * + * @return True if the current core is a DM core, false otherwise. + */ inline int __attribute__((const)) snrt_is_dm_core() { return !snrt_is_compute_core(); }