[software] Add fully_connected_f16 kernel
mbertuletti committed Dec 13, 2024
1 parent 76302d8 commit 32224c1
Showing 6 changed files with 372 additions and 2 deletions.
53 changes: 53 additions & 0 deletions software/apps/baremetal/messagep_f16/main.c
@@ -0,0 +1,53 @@
// Copyright 2021 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

// Author: Marco Bertuletti, ETH Zurich

#include <stdint.h>
#include <string.h>

#include "dma.h"
#include "encoding.h"
#include "runtime.h"
#include "synchronization.h"

#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_messagep_f16.h"
#include "data_messagep_f16.h"

__fp16 l1_A[matrix_N]
    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 l1_W[matrix_M * matrix_N]
    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 l1_B[matrix_M]
    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));

int main() {
  uint32_t core_id = mempool_get_core_id();
  uint32_t num_cores = mempool_get_core_count();
  // Initialize barrier and synchronize
  mempool_barrier_init(core_id);

  // Initialize matrices
  if (core_id == 0) {
    dma_memcpy_blocking(l1_A, l2_A, (matrix_N) * sizeof(int16_t));
    dma_memcpy_blocking(l1_W, l2_W, (matrix_M * matrix_N) * sizeof(int16_t));
    if (BIAS == 1) {
      dma_memcpy_blocking(l1_B, l2_B, (matrix_M) * sizeof(int16_t));
    }
  }
  mempool_barrier(num_cores);

  if (core_id == 0) {
    // Execute function to test.
    mempool_start_benchmark();
    fullyconn_f16s_unrolled4(l1_A, l1_B, l1_W, matrix_M, matrix_N, BIAS, RELU);
    mempool_stop_benchmark();
  }
  mempool_barrier(num_cores);
  mempool_check_f16(l1_B, l2_Y, matrix_M, 0.01f, 0);
  mempool_barrier(num_cores);

  return 0;
}
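
The kernel called above, fullyconn_f16s_unrolled4, is defined in baremetal/mempool_messagep_f16.h, which is not part of the excerpt shown here. For orientation, a minimal non-unrolled sketch of the computation the test expects is given below: the bias is preloaded into b, the matrix-vector product is accumulated on top of it, and an optional ReLU is applied. The helper name and the float accumulation are assumptions made for clarity; this is an illustration, not the committed kernel.

// Reference sketch (illustrative only, not the committed kernel).
// Computes b[i] = ReLU(b[i] + dot(W[i, :], a)) for each of the M outputs.
static inline void fullyconn_f16_reference(const __fp16 *a, __fp16 *b,
                                           const __fp16 *w, uint32_t M,
                                           uint32_t N, uint32_t bias,
                                           uint32_t relu) {
  for (uint32_t i = 0; i < M; i++) {
    float acc = bias ? (float)b[i] : 0.0f; // bias is preloaded into b
    for (uint32_t j = 0; j < N; j++) {
      acc += (float)w[i * N + j] * (float)a[j];
    }
    if (relu && (acc < 0.0f)) {
      acc = 0.0f;
    }
    b[i] = (__fp16)acc; // result overwrites the bias buffer, as checked above
  }
}
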
6 changes: 4 additions & 2 deletions software/data/gendata_header.py
@@ -14,6 +14,7 @@
import numpy

import gendatalib as datalib
import gendatalib_nn as datalib_nn
import pyflexfloat as ff


@@ -178,11 +179,9 @@ def get_type(type_string):
"cholesky_q32": {"func": datalib.generate_qcholesky},
"cmatmul_f16": {"func": datalib.generate_fcmatmul},
"cmatmul_q16": {"func": datalib.generate_qcmatmul},
"conv2d_depthwise_f16": {"func": datalib.generate_fconv2d_depthwise_pointwise},
"dotp_f16": {"func": datalib.generate_fdotp},
"dotp_f32": {"func": datalib.generate_fdotp},
"dotp_i32": {"func": datalib.generate_idotp},
"layernorm_f16": {"func": datalib.generate_flayernorm},
"matmul_f16": {"func": datalib.generate_fmatmul},
"matmul_f8": {"func": datalib.generate_fmatmul},
"matmul_f32": {"func": datalib.generate_fmatmul},
@@ -196,6 +195,9 @@ def get_type(type_string):
"ofdm_f16": {"func": datalib.generate_fofdm},
"fence": {"func": datalib.generate_iarray},
"memcpy": {"func": datalib.generate_iarray},
"conv2d_depthwise_f16": {"func": datalib_nn.generate_fconv2d_depthwise_pointwise},
"layernorm_f16": {"func": datalib_nn.generate_flayernorm},
"messagep_f16": {"func": datalib_nn.generate_ffullyconn},
}

# Check if app_name exists in the function map
16 changes: 16 additions & 0 deletions software/data/gendata_params.hjson
@@ -318,6 +318,22 @@
]
}

"messagep_f16": {
"type": "float16",
"defines": [
("matrix_M", 32)
("matrix_N", 32)
("BIAS", 1)
("RELU", 1)
]
"arrays": [
("__fp16", "l2_A")
("__fp16", "l2_Y")
("__fp16", "l2_W")
("__fp16", "l2_B")
]
},

"mimo_mmse_f16": {
"type": "float16",
"defines": [
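
The defines and arrays above are emitted into the data_messagep_f16.h header that main.c includes. The exact layout produced by gendata_header.py is not shown in this diff; a hypothetical sketch of the generated header, using only the names the test relies on, could look like this:

// Hypothetical sketch of data_messagep_f16.h (layout and qualifiers assumed).
#define matrix_M (32)
#define matrix_N (32)
#define BIAS (1)
#define RELU (1)

__fp16 l2_A[matrix_N] = { /* generated input */ };
__fp16 l2_Y[matrix_M] = { /* golden output */ };
__fp16 l2_W[matrix_M * matrix_N] = { /* generated weights */ };
__fp16 l2_B[matrix_M] = { /* generated bias */ };
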
20 changes: 20 additions & 0 deletions software/data/gendatalib.py
@@ -355,6 +355,26 @@ def generate_fconv2d_pointwise(my_type=np.float32, defines={}):
    return [A, W, B], defines


def generate_ffullyconn(my_type=np.float32, defines={}):

    matrix_M = defines['matrix_M']  # number of outputs (rows of W)
    matrix_N = defines['matrix_N']  # number of inputs (columns of W)

    W = (5 * np.random.rand(matrix_M, matrix_N) - 2.5).astype(my_type)
    A = (5 * np.random.rand(matrix_N) - 2.5).astype(my_type)
    if defines['BIAS'] == 1:
        B = (5 * np.random.rand(matrix_M) - 2.5).astype(my_type)
    else:
        B = np.zeros((matrix_M), dtype=my_type)

    B += np.matmul(W, A).astype(my_type)
    if defines['RELU'] == 1:
        B = np.maximum(B, 0)
    Y = B

    return [A, Y, B, W], defines


def generate_flayernorm(my_type=np.float32, defines={}):

    # Create matrix
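
In formula form, generate_ffullyconn builds the golden output checked by the baremetal test: with weights $W \in \mathbb{R}^{M \times N}$, input $a \in \mathbb{R}^{N}$ and bias $b \in \mathbb{R}^{M}$,

$$ y_i = \mathrm{ReLU}\Big(b_i + \sum_{j=0}^{N-1} W_{ij}\, a_j\Big), \qquad i = 0, \dots, M-1, $$

where $b = 0$ when BIAS is 0 and the ReLU is skipped when RELU is 0.
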
195 changes: 195 additions & 0 deletions software/data/gendatalib_nn.py
@@ -0,0 +1,195 @@
#!/usr/bin/env python3

# Copyright 2022 ETH Zurich and University of Bologna.
# Solderpad Hardware License, Version 0.51, see LICENSE for details.
# SPDX-License-Identifier: SHL-0.51

# This script generates data for the fp16 NN kernels.
# Author: Marco Bertuletti <[email protected]>

# The script generates random inputs for the C functions.

import numpy as np
import math


def fconv2d_depthwise(A, W, B):
    """Two-dimensional depthwise convolution.
    Uses SAME padding with 0s, a stride of 1 and no dilation. A single output
    channel is used per input channel (channel_multiplier=1).
    input: input array with shape (height, width, in_depth)
    w: filter array with shape (fd, fd, in_depth)
    Returns a result with shape (height, width, in_depth).
    """

    [matrix_M, matrix_N, matrix_D] = np.shape(A)
    kernel_K = np.shape(W)[0]

    padw = kernel_K // 2
    padded_input = np.pad(A,
                          pad_width=((padw, padw), (padw, padw), (0, 0)),
                          mode='constant',
                          constant_values=0)

    for c in range(matrix_D):
        # For each input channel separately, apply its corresponding filter
        # to the input.
        for i in range(matrix_M):
            for j in range(matrix_N):

                for fi in range(kernel_K):
                    for fj in range(kernel_K):
                        w_element = W[fi, fj, c]
                        B[i, j, c] += (
                            padded_input[i + fi, j + fj, c] * w_element)
    return B


def fconv2d_pointwise(A, W, B):
    """Pointwise (1x1) convolution, used as the second stage of a
    depthwise-separable convolution. Uses a stride of 1 and no dilation.
    input: input array with shape (height, width, in_depth)
    w_pointwise: pointwise filter array with shape (in_depth, out_depth)
    Returns a result with shape (height, width, out_depth).
    """

    [matrix_M, matrix_N, matrix_D] = np.shape(A)
    kernel_D = np.shape(W)[1]

    for out_c in range(kernel_D):

        for i in range(matrix_M):
            for j in range(matrix_N):
                for c in range(matrix_D):
                    w_element = W[c, out_c]
                    B[i, j, out_c] += A[i, j, c] * w_element
    return B


def generate_fconv2d_depthwise_pointwise(my_type=np.float32, defines={}):

    matrix_M = defines['matrix_M']  # width of input
    matrix_N = defines['matrix_N']  # height of input
    matrix_D = defines['matrix_D']  # depth of input

    kernel_K = defines['kernel_K']  # Width of kernel
    kernel_D = defines['kernel_D']  # Channels of kernel

    A = np.random.rand(matrix_M, matrix_N, matrix_D).astype(my_type)
    Wd = np.random.rand(kernel_K, kernel_K, matrix_D).astype(my_type)
    Wp = (5 * np.random.rand(matrix_D, kernel_D) - 2.5)

    B = np.zeros((matrix_M, matrix_N, matrix_D), dtype=my_type)
    B = fconv2d_depthwise(A, Wd, B)
    Bd = np.reshape(B, (matrix_M * matrix_N * matrix_D)).astype(my_type)

    Bp = np.zeros((matrix_M, matrix_N, kernel_D), dtype=my_type)
    Bp = fconv2d_pointwise(B, Wp, Bp)
    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
    Bp = np.reshape(Bp, (matrix_M * matrix_N * kernel_D)).astype(my_type)
    Wd = np.reshape(Wd, (kernel_K * kernel_K * matrix_D)).astype(my_type)
    Wp = np.reshape(Wp, (matrix_D * kernel_D), order='F').astype(my_type)

    return [A, Wd, Wp, Bd, Bp], defines


def generate_fconv2d_depthwise(my_type=np.float32, defines={}):

    matrix_M = defines['matrix_M']  # width of input
    matrix_N = defines['matrix_N']  # height of input
    matrix_D = defines['matrix_D']  # depth of input

    kernel_K = defines['kernel_K']  # Width of kernel

    A = np.random.rand(matrix_M, matrix_N, matrix_D).astype(my_type)
    W = np.random.rand(kernel_K, kernel_K, matrix_D).astype(my_type)
    B = np.zeros((matrix_M, matrix_N, matrix_D), dtype=my_type)

    B = fconv2d_depthwise(A, W, B)

    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
    B = np.reshape(B, (matrix_M * matrix_N * matrix_D)).astype(my_type)
    W = np.reshape(W, (kernel_K * kernel_K * matrix_D)).astype(my_type)

    return [A, W, B], defines


def generate_fconv2d_pointwise(my_type=np.float32, defines={}):

    matrix_M = defines['matrix_M']  # width of input
    matrix_N = defines['matrix_N']  # height of input
    matrix_D = defines['matrix_D']  # depth of input

    kernel_D = defines['kernel_D']  # Channels of kernel

    A = (5 * np.random.rand(matrix_M, matrix_N, matrix_D) - 2.5)
    W = (5 * np.random.rand(matrix_D, kernel_D) - 2.5)
    A = A.astype(my_type)
    W = W.astype(my_type)
    B = np.zeros((matrix_M, matrix_N, kernel_D), dtype=my_type)

    B = fconv2d_pointwise(A, W, B)

    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
    B = np.reshape(B, (matrix_M * matrix_N * kernel_D)).astype(my_type)
    W = np.reshape(W, (matrix_D * kernel_D), order='F').astype(my_type)

    return [A, W, B], defines


def generate_ffullyconn(my_type=np.float32, defines={}):

    matrix_M = defines['matrix_M']  # number of outputs (rows of W)
    matrix_N = defines['matrix_N']  # number of inputs (columns of W)

    W = (5 * np.random.rand(matrix_M, matrix_N) - 2.5).astype(my_type)
    A = (5 * np.random.rand(matrix_N) - 2.5).astype(my_type)
    if defines['BIAS'] == 1:
        B = (5 * np.random.rand(matrix_M) - 2.5).astype(my_type)
    else:
        B = np.zeros((matrix_M), dtype=my_type)
    B += np.matmul(W, A).astype(my_type)
    if defines['RELU'] == 1:
        Y = np.maximum(B, 0)
    else:
        Y = B

    W = np.reshape(W, (matrix_M * matrix_N)).astype(my_type)

    return [A, Y, W, B], defines


def generate_flayernorm(my_type=np.float32, defines={}):

    # Create matrix
    array_N = defines['array_N']
    X = (np.random.rand(array_N)).astype(my_type)

    eps = np.array([0.01], dtype=np.float32)
    gamma = np.array([np.random.rand() - 0.5], dtype=np.float32)
    beta = np.array([np.random.rand() - 0.5], dtype=np.float32)

    # Compute mean and variance along the last axis
    mean = np.mean(X, axis=-1, keepdims=True).astype(my_type)
    var = np.var(X, axis=-1, keepdims=True).astype(my_type)

    # Normalize
    X_normalized = (X - mean) / np.sqrt(var + eps)
    # Scale and shift
    Y = gamma * X_normalized + beta

    if defines['RELU'] == 1:
        Y = np.maximum(Y, 0)

    return [X, Y, eps, gamma, beta], defines
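
Summarizing the reference computations above: with $\tilde{A}$ the input zero-padded by $\lfloor K/2 \rfloor$ on each spatial side (SAME padding), the depthwise stage computes

$$ B[i, j, c] = \sum_{f_i=0}^{K-1} \sum_{f_j=0}^{K-1} \tilde{A}[i + f_i,\, j + f_j,\, c] \; W_d[f_i, f_j, c], $$

the pointwise stage computes

$$ B[i, j, o] = \sum_{c=0}^{D-1} A[i, j, c] \; W_p[c, o], $$

and generate_flayernorm normalizes the whole input array as

$$ y = \gamma \, \frac{x - \mu}{\sqrt{\sigma^2 + \varepsilon}} + \beta, $$

with $\mu$ and $\sigma^2$ the mean and variance of $x$, followed by an optional ReLU.
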