[software] Add fully_connected_f16 kernel
mbertuletti committed Dec 13, 2024
1 parent 76302d8 commit 32224c1
Showing 6 changed files with 372 additions and 2 deletions.
53 changes: 53 additions & 0 deletions software/apps/baremetal/messagep_f16/main.c
@@ -0,0 +1,53 @@
// Copyright 2021 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

// Author: Marco Bertuletti, ETH Zurich

#include <stdint.h>
#include <string.h>

#include "dma.h"
#include "encoding.h"
#include "runtime.h"
#include "synchronization.h"

#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_messagep_f16.h"
#include "data_messagep_f16.h"

__fp16 l1_A[matrix_N]
    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 l1_W[matrix_M * matrix_N]
    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 l1_B[matrix_M]
    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));

int main() {
  uint32_t core_id = mempool_get_core_id();
  uint32_t num_cores = mempool_get_core_count();
  // Initialize barrier and synchronize
  mempool_barrier_init(core_id);

  // Initialize matrices
  if (core_id == 0) {
    dma_memcpy_blocking(l1_A, l2_A, (matrix_N) * sizeof(int16_t));
    dma_memcpy_blocking(l1_W, l2_W, (matrix_M * matrix_N) * sizeof(int16_t));
    if (BIAS == 1) {
      dma_memcpy_blocking(l1_B, l2_B, (matrix_M) * sizeof(int16_t));
    }
  }
  mempool_barrier(num_cores);

  if (core_id == 0) {
    // Execute function to test.
    mempool_start_benchmark();
    fullyconn_f16s_unrolled4(l1_A, l1_B, l1_W, matrix_M, matrix_N, BIAS, RELU);
    mempool_stop_benchmark();
  }
  mempool_barrier(num_cores);
  mempool_check_f16(l1_B, l2_Y, matrix_M, 0.01f, 0);
  mempool_barrier(num_cores);

  return 0;
}
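
The kernel called above, fullyconn_f16s_unrolled4, is defined in baremetal/mempool_messagep_f16.h, which is not part of the excerpt shown here. For orientation, a minimal non-unrolled sketch of the computation the test expects is given below: the bias is preloaded into b, the matrix-vector product is accumulated on top of it, and an optional ReLU is applied. The helper name and the float accumulation are assumptions made for clarity; this is an illustration, not the committed kernel.

// Reference sketch (illustrative only, not the committed kernel).
// Computes b[i] = ReLU(b[i] + dot(W[i, :], a)) for each of the M outputs.
static inline void fullyconn_f16_reference(const __fp16 *a, __fp16 *b,
                                           const __fp16 *w, uint32_t M,
                                           uint32_t N, uint32_t bias,
                                           uint32_t relu) {
  for (uint32_t i = 0; i < M; i++) {
    float acc = bias ? (float)b[i] : 0.0f; // bias is preloaded into b
    for (uint32_t j = 0; j < N; j++) {
      acc += (float)w[i * N + j] * (float)a[j];
    }
    if (relu && (acc < 0.0f)) {
      acc = 0.0f;
    }
    b[i] = (__fp16)acc; // result overwrites the bias buffer, as checked above
  }
}
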
6 changes: 4 additions & 2 deletions software/data/gendata_header.py
@@ -14,6 +14,7 @@
import numpy

import gendatalib as datalib
import gendatalib_nn as datalib_nn
import pyflexfloat as ff


@@ -178,11 +179,9 @@ def get_type(type_string):
"cholesky_q32": {"func": datalib.generate_qcholesky},
"cmatmul_f16": {"func": datalib.generate_fcmatmul},
"cmatmul_q16": {"func": datalib.generate_qcmatmul},
"conv2d_depthwise_f16": {"func": datalib.generate_fconv2d_depthwise_pointwise},
"dotp_f16": {"func": datalib.generate_fdotp},
"dotp_f32": {"func": datalib.generate_fdotp},
"dotp_i32": {"func": datalib.generate_idotp},
"layernorm_f16": {"func": datalib.generate_flayernorm},
"matmul_f16": {"func": datalib.generate_fmatmul},
"matmul_f8": {"func": datalib.generate_fmatmul},
"matmul_f32": {"func": datalib.generate_fmatmul},
@@ -196,6 +195,9 @@ def get_type(type_string):
"ofdm_f16": {"func": datalib.generate_fofdm},
"fence": {"func": datalib.generate_iarray},
"memcpy": {"func": datalib.generate_iarray},
"conv2d_depthwise_f16": {"func": datalib_nn.generate_fconv2d_depthwise_pointwise},
"layernorm_f16": {"func": datalib_nn.generate_flayernorm},
"messagep_f16": {"func": datalib_nn.generate_ffullyconn},
}

# Check if app_name exists in the function map
16 changes: 16 additions & 0 deletions software/data/gendata_params.hjson
@@ -318,6 +318,22 @@
]
}

"messagep_f16": {
"type": "float16",
"defines": [
("matrix_M", 32)
("matrix_N", 32)
("BIAS", 1)
("RELU", 1)
]
"arrays": [
("__fp16", "l2_A")
("__fp16", "l2_Y")
("__fp16", "l2_W")
("__fp16", "l2_B")
]
},

"mimo_mmse_f16": {
"type": "float16",
"defines": [
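
The defines and arrays above are emitted into the data_messagep_f16.h header that main.c includes. The exact layout produced by gendata_header.py is not shown in this diff; a hypothetical sketch of the generated header, using only the names the test relies on, could look like this:

// Hypothetical sketch of data_messagep_f16.h (layout and qualifiers assumed).
#define matrix_M (32)
#define matrix_N (32)
#define BIAS (1)
#define RELU (1)

__fp16 l2_A[matrix_N] = { /* generated input */ };
__fp16 l2_Y[matrix_M] = { /* golden output */ };
__fp16 l2_W[matrix_M * matrix_N] = { /* generated weights */ };
__fp16 l2_B[matrix_M] = { /* generated bias */ };
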
20 changes: 20 additions & 0 deletions software/data/gendatalib.py
@@ -355,6 +355,26 @@ def generate_fconv2d_pointwise(my_type=np.float32, defines={}):
    return [A, W, B], defines


def generate_ffullyconn(my_type=np.float32, defines={}):

    matrix_M = defines['matrix_M']  # number of outputs (rows of W)
    matrix_N = defines['matrix_N']  # number of inputs (columns of W)

    W = (5 * np.random.rand(matrix_M, matrix_N) - 2.5).astype(my_type)
    A = (5 * np.random.rand(matrix_N) - 2.5).astype(my_type)
    if defines['BIAS'] == 1:
        B = (5 * np.random.rand(matrix_M) - 2.5).astype(my_type)
    else:
        B = np.zeros((matrix_M), dtype=my_type)

    B += np.matmul(W, A).astype(my_type)
    if defines['RELU'] == 1:
        B = np.maximum(B, 0)
    Y = B

    return [A, Y, B, W], defines


def generate_flayernorm(my_type=np.float32, defines={}):

    # Create matrix
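
In formula form, generate_ffullyconn builds the golden output checked by the baremetal test: with weights $W \in \mathbb{R}^{M \times N}$, input $a \in \mathbb{R}^{N}$ and bias $b \in \mathbb{R}^{M}$,

$$ y_i = \mathrm{ReLU}\Big(b_i + \sum_{j=0}^{N-1} W_{ij}\, a_j\Big), \qquad i = 0, \dots, M-1, $$

where $b = 0$ when BIAS is 0 and the ReLU is skipped when RELU is 0.
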
195 changes: 195 additions & 0 deletions software/data/gendatalib_nn.py
@@ -0,0 +1,195 @@
#!/usr/bin/env python3

# Copyright 2022 ETH Zurich and University of Bologna.
# Solderpad Hardware License, Version 0.51, see LICENSE for details.
# SPDX-License-Identifier: SHL-0.51

# This script generates data for the fp16 NN kernels.
# Author: Marco Bertuletti <[email protected]>

# The script generates random inputs for the C functions.

import numpy as np
import math


def fconv2d_depthwise(A, W, B):
    """Two-dimensional depthwise convolution.
    Uses SAME padding with 0s, a stride of 1 and no dilation. A single output
    channel is used per input channel (channel_multiplier=1).
    input: input array with shape (height, width, in_depth)
    w: filter array with shape (fd, fd, in_depth)
    Returns a result with shape (height, width, in_depth).
    """

    [matrix_M, matrix_N, matrix_D] = np.shape(A)
    kernel_K = np.shape(W)[0]

    padw = kernel_K // 2
    padded_input = np.pad(A,
                          pad_width=((padw, padw), (padw, padw), (0, 0)),
                          mode='constant',
                          constant_values=0)

    for c in range(matrix_D):
        # For each input channel separately, apply its corresponding filter
        # to the input.
        for i in range(matrix_M):
            for j in range(matrix_N):

                for fi in range(kernel_K):
                    for fj in range(kernel_K):
                        w_element = W[fi, fj, c]
                        B[i, j, c] += (
                            padded_input[i + fi, j + fj, c] * w_element)
    return B


def fconv2d_pointwise(A, W, B):
    """Pointwise (1x1) convolution, used as the second stage of a
    depthwise-separable convolution. Uses a stride of 1 and no dilation.
    input: input array with shape (height, width, in_depth)
    w_pointwise: pointwise filter array with shape (in_depth, out_depth)
    Returns a result with shape (height, width, out_depth).
    """

    [matrix_M, matrix_N, matrix_D] = np.shape(A)
    kernel_D = np.shape(W)[1]

    for out_c in range(kernel_D):

        for i in range(matrix_M):
            for j in range(matrix_N):
                for c in range(matrix_D):
                    w_element = W[c, out_c]
                    B[i, j, out_c] += A[i, j, c] * w_element
    return B


def generate_fconv2d_depthwise_pointwise(my_type=np.float32, defines={}):

    matrix_M = defines['matrix_M']  # width of input
    matrix_N = defines['matrix_N']  # height of input
    matrix_D = defines['matrix_D']  # depth of input

    kernel_K = defines['kernel_K']  # Width of kernel
    kernel_D = defines['kernel_D']  # Channels of kernel

    A = np.random.rand(matrix_M, matrix_N, matrix_D).astype(my_type)
    Wd = np.random.rand(kernel_K, kernel_K, matrix_D).astype(my_type)
    Wp = (5 * np.random.rand(matrix_D, kernel_D) - 2.5)

    B = np.zeros((matrix_M, matrix_N, matrix_D), dtype=my_type)
    B = fconv2d_depthwise(A, Wd, B)
    Bd = np.reshape(B, (matrix_M * matrix_N * matrix_D)).astype(my_type)

    Bp = np.zeros((matrix_M, matrix_N, kernel_D), dtype=my_type)
    Bp = fconv2d_pointwise(B, Wp, Bp)
    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
    Bp = np.reshape(Bp, (matrix_M * matrix_N * kernel_D)).astype(my_type)
    Wd = np.reshape(Wd, (kernel_K * kernel_K * matrix_D)).astype(my_type)
    Wp = np.reshape(Wp, (matrix_D * kernel_D), order='F').astype(my_type)

    return [A, Wd, Wp, Bd, Bp], defines


def generate_fconv2d_depthwise(my_type=np.float32, defines={}):

    matrix_M = defines['matrix_M']  # width of input
    matrix_N = defines['matrix_N']  # height of input
    matrix_D = defines['matrix_D']  # depth of input

    kernel_K = defines['kernel_K']  # Width of kernel

    A = np.random.rand(matrix_M, matrix_N, matrix_D).astype(my_type)
    W = np.random.rand(kernel_K, kernel_K, matrix_D).astype(my_type)
    B = np.zeros((matrix_M, matrix_N, matrix_D), dtype=my_type)

    B = fconv2d_depthwise(A, W, B)

    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
    B = np.reshape(B, (matrix_M * matrix_N * matrix_D)).astype(my_type)
    W = np.reshape(W, (kernel_K * kernel_K * matrix_D)).astype(my_type)

    return [A, W, B], defines


def generate_fconv2d_pointwise(my_type=np.float32, defines={}):

    matrix_M = defines['matrix_M']  # width of input
    matrix_N = defines['matrix_N']  # height of input
    matrix_D = defines['matrix_D']  # depth of input

    kernel_D = defines['kernel_D']  # Channels of kernel

    A = (5 * np.random.rand(matrix_M, matrix_N, matrix_D) - 2.5)
    W = (5 * np.random.rand(matrix_D, kernel_D) - 2.5)
    A = A.astype(my_type)
    W = W.astype(my_type)
    B = np.zeros((matrix_M, matrix_N, kernel_D), dtype=my_type)

    B = fconv2d_pointwise(A, W, B)

    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
    B = np.reshape(B, (matrix_M * matrix_N * kernel_D)).astype(my_type)
    W = np.reshape(W, (matrix_D * kernel_D), order='F').astype(my_type)

    return [A, W, B], defines


def generate_ffullyconn(my_type=np.float32, defines={}):

    matrix_M = defines['matrix_M']  # number of outputs (rows of W)
    matrix_N = defines['matrix_N']  # number of inputs (columns of W)

    W = (5 * np.random.rand(matrix_M, matrix_N) - 2.5).astype(my_type)
    A = (5 * np.random.rand(matrix_N) - 2.5).astype(my_type)
    if defines['BIAS'] == 1:
        B = (5 * np.random.rand(matrix_M) - 2.5).astype(my_type)
    else:
        B = np.zeros((matrix_M), dtype=my_type)
    B += np.matmul(W, A).astype(my_type)
    if defines['RELU'] == 1:
        Y = np.maximum(B, 0)
    else:
        Y = B

    W = np.reshape(W, (matrix_M * matrix_N)).astype(my_type)

    return [A, Y, W, B], defines


def generate_flayernorm(my_type=np.float32, defines={}):

    # Create matrix
    array_N = defines['array_N']
    X = (np.random.rand(array_N)).astype(my_type)

    eps = np.array([0.01], dtype=np.float32)
    gamma = np.array([np.random.rand() - 0.5], dtype=np.float32)
    beta = np.array([np.random.rand() - 0.5], dtype=np.float32)

    # Compute mean and variance along the last axis
    mean = np.mean(X, axis=-1, keepdims=True).astype(my_type)
    var = np.var(X, axis=-1, keepdims=True).astype(my_type)

    # Normalize
    X_normalized = (X - mean) / np.sqrt(var + eps)
    # Scale and shift
    Y = gamma * X_normalized + beta

    if defines['RELU'] == 1:
        Y = np.maximum(Y, 0)

    return [X, Y, eps, gamma, beta], defines
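
Summarizing the reference computations above: with $\tilde{A}$ the input zero-padded by $\lfloor K/2 \rfloor$ on each spatial side (SAME padding), the depthwise stage computes

$$ B[i, j, c] = \sum_{f_i=0}^{K-1} \sum_{f_j=0}^{K-1} \tilde{A}[i + f_i,\, j + f_j,\, c] \; W_d[f_i, f_j, c], $$

the pointwise stage computes

$$ B[i, j, o] = \sum_{c=0}^{D-1} A[i, j, c] \; W_p[c, o], $$

and generate_flayernorm normalizes the whole input array as

$$ y = \gamma \, \frac{x - \mu}{\sqrt{\sigma^2 + \varepsilon}} + \beta, $$

with $\mu$ and $\sigma^2$ the mean and variance of $x$, followed by an optional ReLU.
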