diff --git a/software/apps/baremetal/messagep_f16/main.c b/software/apps/baremetal/messagep_f16/main.c
index 8e7ab3d2c..dd1bafb0c 100644
--- a/software/apps/baremetal/messagep_f16/main.c
+++ b/software/apps/baremetal/messagep_f16/main.c
@@ -16,11 +16,11 @@
 #include "baremetal/mempool_messagep_f16.h"
 #include "data_messagep_f16.h"
 
-__fp16 l1_A[matrix_N]
+__fp16 l1_A[matrix_P * matrix_M * matrix_N * matrix_D]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
-__fp16 l1_W[matrix_M * matrix_N]
+__fp16 l1_B[matrix_P * matrix_M * matrix_N * matrix_D]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
-__fp16 l1_B[matrix_M]
+__fp16 l1_HL[matrix_P * matrix_M * matrix_N * width_HL]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 
 int main() {
@@ -31,22 +31,26 @@ int main() {
 
   // Initialize Matrices 1
   if (core_id == 0) {
-    dma_memcpy_blocking(l1_A, l2_A, (matrix_N) * sizeof(int16_t));
-    dma_memcpy_blocking(l1_W, l2_W, (matrix_M, matrix_N) * sizeof(int16_t));
-    if (BIAS == 1) {
-      dma_memcpy_blocking(l1_B, l2_B, (matrix_M) * sizeof(int16_t));
-    }
+    dma_memcpy_blocking(l1_A, l2_A,
+                        (matrix_P * matrix_M * matrix_N * matrix_D) *
+                            sizeof(int16_t));
+    dma_memcpy_blocking(l1_HL, l2_HL,
+                        (matrix_P * matrix_M * matrix_N * width_HL) *
+                            sizeof(int16_t));
   }
   mempool_barrier(num_cores);
 
   if (core_id == 0) {
     // Execute function to test.
     mempool_start_benchmark();
-    fullyconn_f16s_unrolled4(l1_A, l1_B, l1_W, matrix_M, matrix_N, BIAS, RELU);
+    messagep_f16s_unrolled4(l1_A, l1_B, matrix_P, matrix_M, matrix_N, matrix_D,
+                            FC_LAYER, l1_HL, l2_W_fc1, l2_W_fc2, width_HL, BIAS,
+                            RELU);
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
 
-  mempool_check_f16(l1_B, l2_Y, matrix_M, 0.01f, 0);
+  mempool_check_f16(l1_B, l2_B, matrix_P * matrix_M * matrix_N * matrix_D,
+                    0.01f, 0);
   mempool_barrier(num_cores);
 
   return 0;
diff --git a/software/data/gendata_header.py b/software/data/gendata_header.py
index 4ae1111c8..ce8c92712 100644
--- a/software/data/gendata_header.py
+++ b/software/data/gendata_header.py
@@ -197,7 +197,7 @@ def get_type(type_string):
         "memcpy": {"func": datalib.generate_iarray},
         "conv2d_depthwise_f16": {"func": datalib_nn.generate_fconv2d_depthwise_pointwise},
         "layernorm_f16": {"func": datalib_nn.generate_flayernorm},
-        "messagep_f16": {"func": datalib_nn.generate_ffullyconn},
+        "messagep_f16": {"func": datalib_nn.generate_fmessagep},
     }
 
     # Check if app_name exists in the function map
diff --git a/software/data/gendata_params.hjson b/software/data/gendata_params.hjson
index 48257ee02..0fdba4138 100644
--- a/software/data/gendata_params.hjson
+++ b/software/data/gendata_params.hjson
@@ -318,19 +318,40 @@
     ]
   }
 
+// "messagep_f16": {
+//   "type": "float16",
+//   "defines": [
+//     ("matrix_M", 32)
+//     ("matrix_N", 32)
+//     ("BIAS", 1)
+//     ("RELU", 1)
+//   ]
+//   "arrays": [
+//     ("__fp16", "l2_A")
+//     ("__fp16", "l2_Y")
+//     ("__fp16", "l2_W")
+//     ("__fp16", "l2_B")
+//   ]
+// },
+
   "messagep_f16": {
     "type": "float16",
     "defines": [
-      ("matrix_M", 32)
-      ("matrix_N", 32)
-      ("BIAS", 1)
-      ("RELU", 1)
+      ("matrix_M", 256)
+      ("matrix_N", 14)
+      ("matrix_P", 4)
+      ("matrix_D", 4)
+      ("width_HL", 8)
+      ("FC_LAYER", 1)
+      ("BIAS", 1)
+      ("RELU", 1)
     ]
     "arrays": [
       ("__fp16", "l2_A")
-      ("__fp16", "l2_Y")
-      ("__fp16", "l2_W")
      ("__fp16", "l2_B")
+      ("__fp16", "l2_HL")
+      ("__fp16", "l2_W_fc1")
+      ("__fp16", "l2_W_fc2")
     ]
   },
diff --git a/software/data/gendatalib.py b/software/data/gendatalib.py
index 4008bb66c..c847140e2 100644
--- a/software/data/gendatalib.py
+++ b/software/data/gendatalib.py
@@ -220,186 +220,6 @@ def generate_fcmatmul(my_type=np.float32, defines={}):
 
     return [A, B, C], defines
 
 
-def fconv2d_depthwise(A, W, B):
-    """Two-dimensional depthwise convolution.
-
-    Uses SAME padding with 0s, a stride of 1 and no dilation. A single output
-    channel is used per input channel (channel_multiplier=1).
-
-    input: input array with shape (height, width, in_depth)
-    w: filter array with shape (fd, fd, in_depth)
-
-    Returns a result with shape (height, width, in_depth).
-    """
-
-    [matrix_M, matrix_N, matrix_D] = np.shape(A)
-    kernel_K = np.shape(W)[0]
-
-    padw = kernel_K // 2
-    padded_input = np.pad(A,
-                          pad_width=((padw, padw), (padw, padw), (0, 0)),
-                          mode='constant',
-                          constant_values=0)
-
-    for c in range(matrix_D):
-        # For each input channel separately, apply its corresponsing filter
-        # to the input.
-        for i in range(matrix_M):
-            for j in range(matrix_N):
-
-                for fi in range(kernel_K):
-                    for fj in range(kernel_K):
-                        w_element = W[fi, fj, c]
-                        B[i, j, c] += (
-                            padded_input[i + fi, j + fj, c] * w_element)
-    return B
-
-
-def fconv2d_pointwise(A, W, B):
-    """Depthwise separable convolution.
-
-    Performs a pointwise 1x1 convolution with w_pointwise.
-
-    Uses SAME padding with 0s, a stride of 1 and no dilation. A single output
-    channel is used per input channel (channel_multiplier=1) in w_depth.
-
-    input: input array with shape (height, width, in_depth)
-    w_pointwise: pointwise filter array with shape (in_depth, out_depth)
-
-    Returns a result with shape (height, width, out_depth).
-    """
-    # First run the depthwise convolution. Its result has the same shape as
-    # input.
-
-    [matrix_M, matrix_N, matrix_D] = np.shape(A)
-    kernel_D = np.shape(W)[1]
-
-    for out_c in range(kernel_D):
-
-        for i in range(matrix_M):
-            for j in range(matrix_N):
-                for c in range(matrix_D):
-                    w_element = W[c, out_c]
-                    B[i, j, out_c] += A[i, j, c] * w_element
-    return B
-
-
-def generate_fconv2d_depthwise_pointwise(my_type=np.float32, defines={}):
-
-    matrix_M = defines['matrix_M']  # width of input
-    matrix_N = defines['matrix_N']  # height of input
-    matrix_D = defines['matrix_D']  # depth of input
-
-    kernel_K = defines['kernel_K']  # Width of kernel
-    kernel_D = defines['kernel_D']  # Channels of kernel
-
-    A = np.random.rand(matrix_M, matrix_N, matrix_D).astype(my_type)
-    Wd = np.random.rand(kernel_K, kernel_K, matrix_D).astype(my_type)
-    Wp = (5 * np.random.rand(matrix_D, kernel_D) - 2.5)
-
-    B = np.zeros((matrix_M, matrix_N, matrix_D), dtype=my_type)
-    B = fconv2d_depthwise(A, Wd, B)
-    Bd = np.reshape(B, (matrix_M * matrix_N * matrix_D)).astype(my_type)
-
-    Bp = np.zeros((matrix_M, matrix_N, kernel_D), dtype=my_type)
-    Bp = fconv2d_pointwise(B, Wp, Bp)
-    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
-    Bp = np.reshape(Bp, (matrix_M * matrix_N * kernel_D)).astype(my_type)
-    Wd = np.reshape(Wd, (kernel_K * kernel_K * matrix_D)).astype(my_type)
-    Wp = np.reshape(Wp, (matrix_D * kernel_D), order='F').astype(my_type)
-
-    return [A, Wd, Wp, Bd, Bp], defines
-
-
-def generate_fconv2d_depthwise(my_type=np.float32, defines={}):
-
-    matrix_M = defines['matrix_M']  # width of input
-    matrix_N = defines['matrix_N']  # height of input
-    matrix_D = defines['matrix_D']  # depth of input
-
-    kernel_K = defines['kernel_K']  # Channels of kernel
-
-    A = np.random.rand(matrix_M, matrix_N, matrix_D).astype(my_type)
-    W = np.random.rand(kernel_K, kernel_K, matrix_D).astype(my_type)
-    B = np.zeros((matrix_M, matrix_N, matrix_D), dtype=my_type)
-
-    B = fconv2d_depthwise(A, W, B)
-
-    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
-    B = np.reshape(B, (matrix_M * matrix_N * matrix_D)).astype(my_type)
-    W = np.reshape(W, (kernel_K * kernel_K * matrix_D)).astype(my_type)
-
-    return [A, W, B], defines
-
-
-def generate_fconv2d_pointwise(my_type=np.float32, defines={}):
-
-    matrix_M = defines['matrix_M']  # width of input
-    matrix_N = defines['matrix_N']  # height of input
-    matrix_D = defines['matrix_D']  # depth of input
-
-    kernel_D = defines['kernel_D']  # Channels of kernel
-
-    A = (5 * np.random.rand(matrix_M, matrix_N, matrix_D) - 2.5)
-    W = (5 * np.random.rand(matrix_D, kernel_D) - 2.5)
-    A = A.astype(my_type)
-    W = W.astype(my_type)
-    B = np.zeros((matrix_M, matrix_N, kernel_D), dtype=my_type)
-
-    B = fconv2d_pointwise(A, W, B)
-
-    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
-    B = np.reshape(B, (matrix_M * matrix_N * kernel_D)).astype(my_type)
-    W = np.reshape(W, (matrix_D * kernel_D), order='F').astype(my_type)
-
-    return [A, W, B], defines
-
-
-def generate_ffullyconn(my_type=np.float32, defines={}):
-
-    matrix_M = defines['matrix_M']  # width of input
-    matrix_N = defines['matrix_N']  # height of input
-
-    W = (5 * np.random.rand(matrix_M, matrix_N) - 2.5).astype(my_type)
-    A = (5 * np.random.rand(matrix_N) - 2.5).astype(my_type)
-    if defines['BIAS'] == 1:
-        B = (5 * np.random.rand(matrix_M) - 2.5).astype(my_type)
-    else:
-        B = np.zeros((matrix_M), dtype=my_type)
-
-    B += np.matmul(W, A).astype(my_type)
-    if defines['RELU'] == 1:
-        B = np.maximum(B, 0)
-    Y = B
-
-    return [A, Y, B, W], defines
-
-
-def generate_flayernorm(my_type=np.float32, defines={}):
-
-    # Create matrix
-    array_N = defines['array_N']
-    X = (np.random.rand(array_N)).astype(my_type)
-
-    eps = np.array([0.01], dtype=np.float32)
-    gamma = np.array([np.random.rand() - 0.5], dtype=np.float32)
-    beta = np.array([np.random.rand() - 0.5], dtype=np.float32)
-
-    # Compute mean and variance along the last axis
-    mean = np.mean(X, axis=-1, keepdims=True).astype(my_type)
-    var = np.var(X, axis=-1, keepdims=True).astype(my_type)
-
-    # Normalize
-    X_normalized = (X - mean) / np.sqrt(var + eps)
-    # Scale and shift
-    Y = gamma * X_normalized + beta
-
-    if defines['RELU'] == 1:
-        Y = np.maximum(Y, 0)
-
-    return [X, Y, eps, gamma, beta], defines
-
-
 def generate_fmatmul(my_type=np.float32, defines={}):
 
     # Create matrix
diff --git a/software/data/gendatalib_nn.py b/software/data/gendatalib_nn.py
index c69b50f6e..f201a100d 100644
--- a/software/data/gendatalib_nn.py
+++ b/software/data/gendatalib_nn.py
@@ -193,3 +193,60 @@ def generate_flayernorm(my_type=np.float32, defines={}):
         Y = np.maximum(Y, 0)
 
     return [X, Y, eps, gamma, beta], defines
+
+
+def generate_fmessagep(my_type=np.float32, defines={}):
+
+    matrix_P = defines['matrix_P']  # number of graph nodes
+    matrix_M = defines['matrix_M']  # width of input
+    matrix_N = defines['matrix_N']  # height of input
+    matrix_D = defines['matrix_D']  # depth of input
+    width_HL = defines['width_HL']  # width of the hidden layer
+
+    A = np.random.rand(matrix_P, matrix_M, matrix_N, matrix_D).astype(my_type)
+    B = np.zeros((matrix_P, matrix_M, matrix_N, matrix_D), dtype=my_type)
+
+    # Outputs and parameters of the hidden-layer
+    W_fc1 = np.random.rand(matrix_P, width_HL, matrix_D).astype(my_type)
+    W_fc2 = np.random.rand(matrix_P, matrix_D, width_HL).astype(my_type)
+    if defines['BIAS'] == 1:
+        HL = np.random.rand(matrix_P, matrix_M, matrix_N, width_HL)
+        HL = HL.astype(my_type)
+    else:
+        HL = np.zeros((matrix_P, matrix_M, matrix_N, width_HL))
+        HL = HL.astype(my_type)
+
+    # Loops over the 2D image
+    for i in range(matrix_M):
+        for j in range(matrix_N):
+            # Loops over the message passing instances
+            for p in range(matrix_P):
+
+                if defines['FC_LAYER'] == 1:
+                    # Apply hidden-layer
+                    HL[p, i, j, :] += np.matmul(W_fc1[p, :], A[p, i, j, :])
+                    if defines['RELU'] == 1:
+                        HL = np.maximum(HL, 0)
+                    A[p, i, j, :] = np.matmul(W_fc2[p, :], HL[p, i, j, :])
+
+                # Loop over depth and sum the message passing instances
+                for d in range(matrix_D):
+                    sum_val = np.float16(0.0)
+                    for np_idx in range(matrix_P):
+                        if np_idx != p:
+                            sum_val += A[np_idx, i, j, d]
+
+                    # Divide sum
+                    sum_val = sum_val / np.float16(matrix_P)
+                    B[p, i, j, d] = sum_val
+
+    A = np.reshape(A, (matrix_P * matrix_M * matrix_N * matrix_D))
+    B = np.reshape(B, (matrix_P * matrix_M * matrix_N * matrix_D))
+    HL = np.reshape(HL, (matrix_P * matrix_M * matrix_N * width_HL))
+    W_fc1 = np.reshape(W_fc1, (matrix_P * width_HL * matrix_D))
+    W_fc2 = np.reshape(W_fc2, (matrix_P * matrix_D * width_HL))
+
+    A = A.astype(my_type)
+    B = B.astype(my_type)
+
+    return [A, B, HL, W_fc1, W_fc2], defines
diff --git a/software/kernels/baremetal/mempool_messagep_f16.h b/software/kernels/baremetal/mempool_messagep_f16.h
index 3e17cb56c..70be2ce5f 100644
--- a/software/kernels/baremetal/mempool_messagep_f16.h
+++ b/software/kernels/baremetal/mempool_messagep_f16.h
@@ -7,16 +7,16 @@
 #pragma once
 #include "builtins_v2.h"
 
-void fullyconn_f16s(__fp16 const *__restrict__ A, __fp16 *B,
-                    __fp16 *__restrict__ W, uint32_t M, uint32_t N,
-                    uint32_t bias, uint32_t relu) {
+static inline void fullyconn_f16s(__fp16 const *__restrict__ A, __fp16 *B,
+                                  __fp16 *__restrict__ W, uint32_t wM,
+                                  uint32_t wN, uint32_t bias, uint32_t relu) {
 
   uint32_t i, j;
   v2h a, w;
   __fp16 b_f16;
   float b;
 
-  for (i = 0; i < M; i++) {
+  for (i = 0; i < wM; i++) {
     // Initialize accumulator
     if (bias) {
       b_f16 = B[i];
@@ -25,9 +25,9 @@ void fullyconn_f16s(__fp16 const *__restrict__ A, __fp16 *B,
       b = 0.0f;
     }
     // Matrix vector multiply
-    for (j = 0; j < N; j += 2) {
+    for (j = 0; j < wN; j += 2) {
       a = *(v2h *)&A[j];
-      w = *(v2h *)&W[i * N + j];
+      w = *(v2h *)&W[i * wN + j];
       asm volatile("vfdotpex.s.h %0, %1, %2;" : "+r"(b) : "r"(a), "r"(w));
     }
     // ReLU
@@ -40,9 +40,10 @@ void fullyconn_f16s(__fp16 const *__restrict__ A, __fp16 *B,
   return;
 }
 
-void fullyconn_f16s_unrolled4(__fp16 const *__restrict__ A, __fp16 *B,
-                              __fp16 *__restrict__ W, uint32_t M, uint32_t N,
-                              uint32_t bias, uint32_t relu) {
+static inline void fullyconn_f16s_unrolled4(__fp16 const *__restrict__ A,
+                                            __fp16 *B, __fp16 *__restrict__ W,
+                                            uint32_t wM, uint32_t wN,
+                                            uint32_t bias, uint32_t relu) {
 
   uint32_t i, j;
   v2h w0, w1, w2, w3;
@@ -50,7 +51,7 @@ void fullyconn_f16s_unrolled4(__fp16 const *__restrict__ A, __fp16 *B,
   __fp16 b_f16;
   float b;
 
-  for (i = 0; i < M; i++) {
+  for (i = 0; i < wM; i++) {
     // Initialize accumulator
     if (bias) {
       b_f16 = B[i];
@@ -59,15 +60,15 @@ void fullyconn_f16s_unrolled4(__fp16 const *__restrict__ A, __fp16 *B,
       b = 0.0f;
     }
     // Matrix vector multiply
-    for (j = 0; j < N; j += 2) {
+    for (j = 0; j < wN; j += 2) {
       a0 = *(v2h *)&A[j + 0];
       a1 = *(v2h *)&A[j + 2];
       a2 = *(v2h *)&A[j + 4];
       a3 = *(v2h *)&A[j + 6];
-      w0 = *(v2h *)&W[i * N + j + 0];
-      w1 = *(v2h *)&W[i * N + j + 2];
-      w2 = *(v2h *)&W[i * N + j + 4];
-      w3 = *(v2h *)&W[i * N + j + 6];
+      w0 = *(v2h *)&W[i * wN + j + 0];
+      w1 = *(v2h *)&W[i * wN + j + 2];
+      w2 = *(v2h *)&W[i * wN + j + 4];
+      w3 = *(v2h *)&W[i * wN + j + 6];
       asm volatile("vfdotpex.s.h %0, %1, %2;" : "+r"(b) : "r"(a0), "r"(w0));
       asm volatile("vfdotpex.s.h %0, %1, %2;" : "+r"(b) : "r"(a1), "r"(w1));
"r"(a1), "r"(w1)); asm volatile("vfdotpex.s.h %0, %1, %2;" : "+r"(b) : "r"(a2), "r"(w2)); @@ -82,3 +83,171 @@ void fullyconn_f16s_unrolled4(__fp16 const *__restrict__ A, __fp16 *B, return; } + +/* + The kernel combines the information from matrix_P tensors by averaging over + the matrix_P dimension matrix_P: message passing instances of the tensor + matrix_M: rows of the input tensor (as in 2D matrix) + matrix_N: rows of the input tensor (as in 2D matrix) + matrix_D: depth of the input tensor + + Parameters of optional hiddel layer: + HL: pointer to hiddel layer output + W_fc1: weights of first fully-connected layer + W_fc2: weights of second fully-connected layer + wHL: depth of the hidden-layer + bias: optional bias + relu: optional relu +*/ +void messagep_f16s(__fp16 *A, __fp16 *B, uint32_t matrix_P, uint32_t matrix_M, + uint32_t matrix_N, uint32_t matrix_D, uint32_t fc_layer, + __fp16 __attribute__((unused)) * HL, + __fp16 __attribute__((unused)) * W_fc1, + __fp16 __attribute__((unused)) * W_fc2, + uint32_t __attribute__((unused)) wHL, + uint32_t __attribute__((unused)) bias, + uint32_t __attribute__((unused)) relu) { + + uint32_t p, i, j, d, mp; + v2h a; + v2h sum; + + __fp16 N_f16; + asm volatile("fcvt.h.wu %0, %1" : "+r"(N_f16) : "r"(matrix_P)); + asm volatile("pv.pack %0, %0, %0" : "+r"(N_f16)); + + // Loops over the 2D image + for (i = 0; i < matrix_M; i++) { + for (j = 0; j < matrix_N; j++) { + + // Apply FC-layer + if (fc_layer) { + // Loops over the message passing instances + for (p = 0; p < matrix_P; p++) { + // Compute the dense layer (wHL == depth of the hidden layer) + __fp16 *ptr1 = &A[p * matrix_M * matrix_N * matrix_D + + i * matrix_N * matrix_D + j * matrix_D]; + __fp16 *ptr2 = &HL[p * matrix_M * matrix_N * matrix_D + + i * matrix_N * wHL + j * wHL]; + fullyconn_f16s(ptr1, ptr2, &W_fc1[p * wHL * matrix_D], wHL, matrix_D, + bias, relu); + fullyconn_f16s(ptr2, ptr1, &W_fc2[p * matrix_D * wHL], matrix_D, wHL, + bias, relu); + } + } + + // Loops over the message passing instances + for (p = 0; p < matrix_P; p++) { + // Loop over depth and sum the message passing instances + for (d = 0; d < matrix_D; d += 2) { + sum = (v2h)0.0f; + for (mp = p + 1; mp < matrix_P; mp++) { + a = *(v2h *)&A[mp * matrix_M * matrix_N * matrix_D + + i * matrix_N * matrix_D + j * matrix_D + d]; + asm volatile("vfadd.h %0, %0, %1" : "+r"(sum) : "r"(a)); + } + for (mp = 0; mp < p; mp++) { + a = *(v2h *)&A[mp * matrix_M * matrix_N * matrix_D + + i * matrix_N * matrix_D + j * matrix_D + d]; + asm volatile("vfadd.h %0, %0, %1" : "+r"(sum) : "r"(a)); + } + // Divide sum + asm volatile("vfdiv.h %0, %0, %1" : "+r"(sum) : "r"(N_f16)); + *((v2h *)&B[p * matrix_M * matrix_N * matrix_D + + i * matrix_N * matrix_D + j * matrix_D + d]) = sum; + } + } + } + } + + return; +} + +void messagep_f16s_unrolled4(__fp16 *A, __fp16 *B, uint32_t matrix_P, + uint32_t matrix_M, uint32_t matrix_N, + uint32_t matrix_D, uint32_t fc_layer, + __fp16 __attribute__((unused)) * HL, + __fp16 __attribute__((unused)) * W_fc1, + __fp16 __attribute__((unused)) * W_fc2, + uint32_t __attribute__((unused)) wHL, + uint32_t __attribute__((unused)) bias, + uint32_t __attribute__((unused)) relu) { + + uint32_t p, i, j, d, mp; + v2h a0, a1, a2, a3; + v2h s0, s1, s2, s3; + + __fp16 N_f16; + asm volatile("fcvt.h.wu %0, %1" : "+r"(N_f16) : "r"(matrix_P)); + asm volatile("pv.pack %0, %0, %0" : "+r"(N_f16)); + + // Loops over the 2D image + for (i = 0; i < matrix_M; i++) { + for (j = 0; j < matrix_N; j++) { + + // Apply FC-layer + if (fc_layer) { + 
+        // Loops over the message passing instances
+        for (p = 0; p < matrix_P; p++) {
+          // Compute the dense layer (wHL == width of the hidden layer)
+          __fp16 *ptr1 = &A[p * matrix_M * matrix_N * matrix_D +
+                            i * matrix_N * matrix_D + j * matrix_D];
+          __fp16 *ptr2 = &HL[p * matrix_M * matrix_N * wHL +
+                             i * matrix_N * wHL + j * wHL];
+          fullyconn_f16s_unrolled4(ptr1, ptr2, &W_fc1[p * wHL * matrix_D], wHL,
+                                   matrix_D, bias, relu);
+          fullyconn_f16s_unrolled4(ptr2, ptr1, &W_fc2[p * matrix_D * wHL],
+                                   matrix_D, wHL, bias, relu);
+        }
+      }
+
+      // Loops over the message passing instances
+      for (p = 0; p < matrix_P; p++) {
+        // Loop over depth and sum the message passing instances
+        for (d = 0; d < matrix_D; d += 8) {
+          s0 = (v2h)0.0f;
+          s1 = (v2h)0.0f;
+          s2 = (v2h)0.0f;
+          s3 = (v2h)0.0f;
+          for (mp = p + 1; mp < matrix_P; mp++) {
+            __fp16 *a_ptr = &A[mp * matrix_M * matrix_N * matrix_D +
+                               i * matrix_N * matrix_D + j * matrix_D];
+            a0 = *(v2h *)&a_ptr[d];
+            a1 = *(v2h *)&a_ptr[d + 2];
+            a2 = *(v2h *)&a_ptr[d + 4];
+            a3 = *(v2h *)&a_ptr[d + 6];
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s0) : "r"(a0));
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s1) : "r"(a1));
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s2) : "r"(a2));
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s3) : "r"(a3));
+          }
+          for (mp = 0; mp < p; mp++) {
+            __fp16 *a_ptr = &A[mp * matrix_M * matrix_N * matrix_D +
+                               i * matrix_N * matrix_D + j * matrix_D];
+            a0 = *(v2h *)&a_ptr[d];
+            a1 = *(v2h *)&a_ptr[d + 2];
+            a2 = *(v2h *)&a_ptr[d + 4];
+            a3 = *(v2h *)&a_ptr[d + 6];
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s0) : "r"(a0));
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s1) : "r"(a1));
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s2) : "r"(a2));
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s3) : "r"(a3));
+          }
+          // Divide sum
+          asm volatile("vfdiv.h %0, %0, %1" : "+r"(s0) : "r"(N_f16));
+          asm volatile("vfdiv.h %0, %0, %1" : "+r"(s1) : "r"(N_f16));
+          asm volatile("vfdiv.h %0, %0, %1" : "+r"(s2) : "r"(N_f16));
+          asm volatile("vfdiv.h %0, %0, %1" : "+r"(s3) : "r"(N_f16));
+          __fp16 *b_ptr = &B[p * matrix_M * matrix_N * matrix_D +
+                             i * matrix_N * matrix_D + j * matrix_D];
+          *((v2h *)&b_ptr[d + 0]) = s0;
+          *((v2h *)&b_ptr[d + 2]) = s1;
+          *((v2h *)&b_ptr[d + 4]) = s2;
+          *((v2h *)&b_ptr[d + 6]) = s3;
+        }
+      }
+    }
+  }
+
+  return;
+}
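For reference, a minimal NumPy sketch (not part of the patch) of the neighbour-averaging step that messagep_f16s implements when the FC layer is disabled; the helper name and the vectorized formulation are illustrative, the shapes follow gendata_params.hjson, and the divisor is matrix_P (not matrix_P - 1), matching the vfdiv by matrix_P in the kernel and the generator above.

```python
import numpy as np


def messagep_average(A):
    """A has shape (P, M, N, D). For each instance p, sum the other P - 1
    instances at the same (i, j, d) and divide by P."""
    P = A.shape[0]
    B = (np.sum(A, axis=0, keepdims=True) - A) / np.float16(P)
    return B.astype(A.dtype)


# Shapes from gendata_params.hjson: (matrix_P, matrix_M, matrix_N, matrix_D)
A = np.random.rand(4, 256, 14, 4).astype(np.float16)
B = messagep_average(A)
```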