diff --git a/software/apps/baremetal/messagep_f16/main.c b/software/apps/baremetal/messagep_f16/main.c
new file mode 100644
index 000000000..8e7ab3d2c
--- /dev/null
+++ b/software/apps/baremetal/messagep_f16/main.c
@@ -0,0 +1,53 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#include <stdint.h>
+#include <string.h>
+
+#include "dma.h"
+#include "encoding.h"
+#include "runtime.h"
+#include "synchronization.h"
+
+#include "baremetal/mempool_checks.h"
+#include "baremetal/mempool_messagep_f16.h"
+#include "data_messagep_f16.h"
+
+__fp16 l1_A[matrix_N]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1_W[matrix_M * matrix_N]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+__fp16 l1_B[matrix_M]
+    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
+
+int main() {
+  uint32_t core_id = mempool_get_core_id();
+  uint32_t num_cores = mempool_get_core_count();
+  // Initialize barrier and synchronize
+  mempool_barrier_init(core_id);
+
+  // Initialize matrices
+  if (core_id == 0) {
+    dma_memcpy_blocking(l1_A, l2_A, (matrix_N) * sizeof(int16_t));
+    dma_memcpy_blocking(l1_W, l2_W, (matrix_M * matrix_N) * sizeof(int16_t));
+    if (BIAS == 1) {
+      dma_memcpy_blocking(l1_B, l2_B, (matrix_M) * sizeof(int16_t));
+    }
+  }
+  mempool_barrier(num_cores);
+
+  if (core_id == 0) {
+    // Execute function to test.
+    mempool_start_benchmark();
+    fullyconn_f16s_unrolled4(l1_A, l1_B, l1_W, matrix_M, matrix_N, BIAS, RELU);
+    mempool_stop_benchmark();
+  }
+  mempool_barrier(num_cores);
+  mempool_check_f16(l1_B, l2_Y, matrix_M, 0.01f, 0);
+  mempool_barrier(num_cores);
+
+  return 0;
+}
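Note: the check at the end of main() compares l1_B, overwritten in place by the kernel, against the golden l2_Y with a 0.01 tolerance. As a reference, here is a minimal NumPy sketch of the computation under test, assuming (as the generator below does) that l2_Y = ReLU(W @ A + B) accumulated in fp32; the function name is illustrative, not part of the patch:

import numpy as np

def fullyconn_reference(A, W, B, relu=True):
    # Accumulate in float32, mirroring the kernel's vfdotpex.s.h,
    # then round the result back to float16.
    y = W.astype(np.float32) @ A.astype(np.float32) + B.astype(np.float32)
    if relu:
        y = np.maximum(y, 0.0)
    return y.astype(np.float16)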
("__fp16", "l2_Y") + ("__fp16", "l2_W") + ("__fp16", "l2_B") + ] + }, + "mimo_mmse_f16": { "type": "float16", "defines": [ diff --git a/software/data/gendatalib.py b/software/data/gendatalib.py index ad5557e74..4008bb66c 100644 --- a/software/data/gendatalib.py +++ b/software/data/gendatalib.py @@ -355,6 +355,26 @@ def generate_fconv2d_pointwise(my_type=np.float32, defines={}): return [A, W, B], defines +def generate_ffullyconn(my_type=np.float32, defines={}): + + matrix_M = defines['matrix_M'] # width of input + matrix_N = defines['matrix_N'] # height of input + + W = (5 * np.random.rand(matrix_M, matrix_N) - 2.5).astype(my_type) + A = (5 * np.random.rand(matrix_N) - 2.5).astype(my_type) + if defines['BIAS'] == 1: + B = (5 * np.random.rand(matrix_M) - 2.5).astype(my_type) + else: + B = np.zeros((matrix_M), dtype=my_type) + + B += np.matmul(W, A).astype(my_type) + if defines['RELU'] == 1: + B = np.maximum(B, 0) + Y = B + + return [A, Y, B, W], defines + + def generate_flayernorm(my_type=np.float32, defines={}): # Create matrix diff --git a/software/data/gendatalib_nn.py b/software/data/gendatalib_nn.py new file mode 100644 index 000000000..c69b50f6e --- /dev/null +++ b/software/data/gendatalib_nn.py @@ -0,0 +1,195 @@ +#!/usr/bin/env python3 + +# Copyright 2022 ETH Zurich and University of Bologna. +# Solderpad Hardware License, Version 0.51, see LICENSE for details. +# SPDX-License-Identifier: SHL-0.51 + +# This script generates data for the fp16 matmul. +# Author: Marco Bertuletti + +# The script generates random inputs for the C functions. + +import numpy as np +import math + + +def fconv2d_depthwise(A, W, B): + """Two-dimensional depthwise convolution. + + Uses SAME padding with 0s, a stride of 1 and no dilation. A single output + channel is used per input channel (channel_multiplier=1). + + input: input array with shape (height, width, in_depth) + w: filter array with shape (fd, fd, in_depth) + + Returns a result with shape (height, width, in_depth). + """ + + [matrix_M, matrix_N, matrix_D] = np.shape(A) + kernel_K = np.shape(W)[0] + + padw = kernel_K // 2 + padded_input = np.pad(A, + pad_width=((padw, padw), (padw, padw), (0, 0)), + mode='constant', + constant_values=0) + + for c in range(matrix_D): + # For each input channel separately, apply its corresponsing filter + # to the input. + for i in range(matrix_M): + for j in range(matrix_N): + + for fi in range(kernel_K): + for fj in range(kernel_K): + w_element = W[fi, fj, c] + B[i, j, c] += ( + padded_input[i + fi, j + fj, c] * w_element) + return B + + +def fconv2d_pointwise(A, W, B): + """Depthwise separable convolution. + + Performs a pointwise 1x1 convolution with w_pointwise. + + Uses SAME padding with 0s, a stride of 1 and no dilation. A single output + channel is used per input channel (channel_multiplier=1) in w_depth. + + input: input array with shape (height, width, in_depth) + w_pointwise: pointwise filter array with shape (in_depth, out_depth) + + Returns a result with shape (height, width, out_depth). + """ + # First run the depthwise convolution. Its result has the same shape as + # input. 
diff --git a/software/data/gendatalib_nn.py b/software/data/gendatalib_nn.py
new file mode 100644
index 000000000..c69b50f6e
--- /dev/null
+++ b/software/data/gendatalib_nn.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+
+# Copyright 2022 ETH Zurich and University of Bologna.
+# Solderpad Hardware License, Version 0.51, see LICENSE for details.
+# SPDX-License-Identifier: SHL-0.51
+
+# This script generates random inputs and golden outputs for the fp16
+# neural-network kernels: depthwise/pointwise convolution, layer
+# normalization and the fully-connected layer.
+# Author: Marco Bertuletti
+
+import numpy as np
+
+
+def fconv2d_depthwise(A, W, B):
+    """Two-dimensional depthwise convolution.
+
+    Uses SAME padding with 0s, a stride of 1 and no dilation. A single output
+    channel is used per input channel (channel_multiplier=1).
+
+    input: input array with shape (height, width, in_depth)
+    w: filter array with shape (fd, fd, in_depth)
+
+    Returns a result with shape (height, width, in_depth).
+    """
+
+    [matrix_M, matrix_N, matrix_D] = np.shape(A)
+    kernel_K = np.shape(W)[0]
+
+    padw = kernel_K // 2
+    padded_input = np.pad(A,
+                          pad_width=((padw, padw), (padw, padw), (0, 0)),
+                          mode='constant',
+                          constant_values=0)
+
+    for c in range(matrix_D):
+        # For each input channel separately, apply its corresponding
+        # filter to the input.
+        for i in range(matrix_M):
+            for j in range(matrix_N):
+                for fi in range(kernel_K):
+                    for fj in range(kernel_K):
+                        w_element = W[fi, fj, c]
+                        B[i, j, c] += (
+                            padded_input[i + fi, j + fj, c] * w_element)
+    return B
+
+
+def fconv2d_pointwise(A, W, B):
+    """Pointwise (1x1) convolution.
+
+    Applies a 1x1 convolution with weights W along the channel dimension,
+    with a stride of 1 and no dilation.
+
+    input: input array with shape (height, width, in_depth)
+    w_pointwise: pointwise filter array with shape (in_depth, out_depth)
+
+    Returns a result with shape (height, width, out_depth).
+    """
+
+    [matrix_M, matrix_N, matrix_D] = np.shape(A)
+    kernel_D = np.shape(W)[1]
+
+    for out_c in range(kernel_D):
+        for i in range(matrix_M):
+            for j in range(matrix_N):
+                for c in range(matrix_D):
+                    w_element = W[c, out_c]
+                    B[i, j, out_c] += A[i, j, c] * w_element
+    return B
+
+
+def generate_fconv2d_depthwise_pointwise(my_type=np.float32, defines={}):
+
+    matrix_M = defines['matrix_M']  # height of input
+    matrix_N = defines['matrix_N']  # width of input
+    matrix_D = defines['matrix_D']  # depth of input
+
+    kernel_K = defines['kernel_K']  # width of kernel
+    kernel_D = defines['kernel_D']  # output channels of kernel
+
+    A = np.random.rand(matrix_M, matrix_N, matrix_D).astype(my_type)
+    Wd = np.random.rand(kernel_K, kernel_K, matrix_D).astype(my_type)
+    Wp = (5 * np.random.rand(matrix_D, kernel_D) - 2.5)
+
+    B = np.zeros((matrix_M, matrix_N, matrix_D), dtype=my_type)
+    B = fconv2d_depthwise(A, Wd, B)
+    Bd = np.reshape(B, (matrix_M * matrix_N * matrix_D)).astype(my_type)
+
+    Bp = np.zeros((matrix_M, matrix_N, kernel_D), dtype=my_type)
+    Bp = fconv2d_pointwise(B, Wp, Bp)
+    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
+    Bp = np.reshape(Bp, (matrix_M * matrix_N * kernel_D)).astype(my_type)
+    Wd = np.reshape(Wd, (kernel_K * kernel_K * matrix_D)).astype(my_type)
+    Wp = np.reshape(Wp, (matrix_D * kernel_D), order='F').astype(my_type)
+
+    return [A, Wd, Wp, Bd, Bp], defines
+
+
+def generate_fconv2d_depthwise(my_type=np.float32, defines={}):
+
+    matrix_M = defines['matrix_M']  # height of input
+    matrix_N = defines['matrix_N']  # width of input
+    matrix_D = defines['matrix_D']  # depth of input
+
+    kernel_K = defines['kernel_K']  # width of kernel
+
+    A = np.random.rand(matrix_M, matrix_N, matrix_D).astype(my_type)
+    W = np.random.rand(kernel_K, kernel_K, matrix_D).astype(my_type)
+    B = np.zeros((matrix_M, matrix_N, matrix_D), dtype=my_type)
+
+    B = fconv2d_depthwise(A, W, B)
+
+    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
+    B = np.reshape(B, (matrix_M * matrix_N * matrix_D)).astype(my_type)
+    W = np.reshape(W, (kernel_K * kernel_K * matrix_D)).astype(my_type)
+
+    return [A, W, B], defines
+
+
+def generate_fconv2d_pointwise(my_type=np.float32, defines={}):
+
+    matrix_M = defines['matrix_M']  # height of input
+    matrix_N = defines['matrix_N']  # width of input
+    matrix_D = defines['matrix_D']  # depth of input
+
+    kernel_D = defines['kernel_D']  # output channels of kernel
+
+    A = (5 * np.random.rand(matrix_M, matrix_N, matrix_D) - 2.5)
+    W = (5 * np.random.rand(matrix_D, kernel_D) - 2.5)
+    A = A.astype(my_type)
+    W = W.astype(my_type)
+    B = np.zeros((matrix_M, matrix_N, kernel_D), dtype=my_type)
+
+    B = fconv2d_pointwise(A, W, B)
+
+    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
+    B = np.reshape(B, (matrix_M * matrix_N * kernel_D)).astype(my_type)
+    W = np.reshape(W, (matrix_D * kernel_D), order='F').astype(my_type)
+
+    return [A, W, B], defines
+
+
+def generate_ffullyconn(my_type=np.float32, defines={}):
+
+    matrix_M = defines['matrix_M']  # number of outputs (rows of W)
+    matrix_N = defines['matrix_N']  # number of inputs (columns of W)
+
+    W = (5 * np.random.rand(matrix_M, matrix_N) - 2.5).astype(my_type)
+    A = (5 * np.random.rand(matrix_N) - 2.5).astype(my_type)
+    if defines['BIAS'] == 1:
+        B = (5 * np.random.rand(matrix_M) - 2.5).astype(my_type)
+    else:
+        B = np.zeros(matrix_M, dtype=my_type)
+
+    # Keep B untouched: it is the bias the kernel accumulates on top of.
+    # Accumulate the golden output in float32, as the kernel does.
+    Y = (B.astype(np.float32) +
+         np.matmul(W.astype(np.float32), A.astype(np.float32)))
+    if defines['RELU'] == 1:
+        Y = np.maximum(Y, 0)
+    Y = Y.astype(my_type)
+
+    W = np.reshape(W, (matrix_M * matrix_N)).astype(my_type)
+
+    return [A, Y, W, B], defines
+
+
+def generate_flayernorm(my_type=np.float32, defines={}):
+
+    # Create input array
+    array_N = defines['array_N']
+    X = (np.random.rand(array_N)).astype(my_type)
+
+    eps = np.array([0.01], dtype=np.float32)
+    gamma = np.array([np.random.rand() - 0.5], dtype=np.float32)
+    beta = np.array([np.random.rand() - 0.5], dtype=np.float32)
+
+    # Compute mean and variance along the last axis
+    mean = np.mean(X, axis=-1, keepdims=True).astype(my_type)
+    var = np.var(X, axis=-1, keepdims=True).astype(my_type)
+
+    # Normalize
+    X_normalized = (X - mean) / np.sqrt(var + eps)
+    # Scale and shift
+    Y = gamma * X_normalized + beta
+
+    if defines['RELU'] == 1:
+        Y = np.maximum(Y, 0)
+
+    return [X, Y, eps, gamma, beta], defines
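Note: a quick self-check of generate_ffullyconn, a sketch that is not part of the patch. With the bias kept separate from the accumulation, the returned arrays must satisfy Y = ReLU(W @ A + B) up to fp16 rounding:

import numpy as np
import gendatalib_nn as datalib_nn

defines = {"matrix_M": 32, "matrix_N": 32, "BIAS": 1, "RELU": 1}
(A, Y, W, B), _ = datalib_nn.generate_ffullyconn(np.float16, defines)
W = W.reshape(32, 32)  # W is returned flattened, row-major
ref = np.maximum(
    W.astype(np.float32) @ A.astype(np.float32) + B.astype(np.float32), 0)
# Loose bound: Y was rounded to fp16 after fp32 accumulation.
assert np.abs(Y.astype(np.float32) - ref).max() < 0.05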
diff --git a/software/kernels/baremetal/mempool_messagep_f16.h b/software/kernels/baremetal/mempool_messagep_f16.h
new file mode 100644
index 000000000..3e17cb56c
--- /dev/null
+++ b/software/kernels/baremetal/mempool_messagep_f16.h
@@ -0,0 +1,88 @@
+// Copyright 2021 ETH Zurich and University of Bologna.
+// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0
+
+// Author: Marco Bertuletti, ETH Zurich
+
+#pragma once
+#include "builtins_v2.h"
+
+// Fully-connected layer B = ReLU(W * A + B) on __fp16 inputs.
+// Accumulates in fp32 via vfdotpex.s.h; N must be a multiple of 2.
+void fullyconn_f16s(__fp16 const *__restrict__ A, __fp16 *B,
+                    __fp16 const *__restrict__ W, uint32_t M, uint32_t N,
+                    uint32_t bias, uint32_t relu) {
+
+  uint32_t i, j;
+  v2h a, w;
+  __fp16 b_f16;
+  float b;
+
+  for (i = 0; i < M; i++) {
+    // Initialize the fp32 accumulator with the bias (if any)
+    if (bias) {
+      b_f16 = B[i];
+      asm volatile("fcvt.s.h %0, %1;" : "=r"(b) : "r"(b_f16));
+    } else {
+      b = 0.0f;
+    }
+    // Matrix-vector multiply: two fp16 elements per iteration
+    for (j = 0; j < N; j += 2) {
+      a = *(v2h *)&A[j];
+      w = *(v2h *)&W[i * N + j];
+      asm volatile("vfdotpex.s.h %0, %1, %2;" : "+r"(b) : "r"(a), "r"(w));
+    }
+    // ReLU
+    b = ((b < 0.0f) && (relu == 1)) ? 0.0f : b;
+    // Convert back to fp16 and store the output
+    asm volatile("fcvt.h.s %0, %1;" : "=r"(b_f16) : "r"(b));
+    B[i] = b_f16;
+  }
+
+  return;
+}
+
+// As fullyconn_f16s, with the inner loop unrolled by a factor of four.
+// Consumes eight fp16 elements per iteration; N must be a multiple of 8.
+void fullyconn_f16s_unrolled4(__fp16 const *__restrict__ A, __fp16 *B,
+                              __fp16 const *__restrict__ W, uint32_t M,
+                              uint32_t N, uint32_t bias, uint32_t relu) {
+
+  uint32_t i, j;
+  v2h w0, w1, w2, w3;
+  v2h a0, a1, a2, a3;
+  __fp16 b_f16;
+  float b;
+
+  for (i = 0; i < M; i++) {
+    // Initialize the fp32 accumulator with the bias (if any)
+    if (bias) {
+      b_f16 = B[i];
+      asm volatile("fcvt.s.h %0, %1;" : "=r"(b) : "r"(b_f16));
+    } else {
+      b = 0.0f;
+    }
+    // Matrix-vector multiply: eight elements (four v2h) per iteration
+    for (j = 0; j < N; j += 8) {
+      a0 = *(v2h *)&A[j + 0];
+      a1 = *(v2h *)&A[j + 2];
+      a2 = *(v2h *)&A[j + 4];
+      a3 = *(v2h *)&A[j + 6];
+      w0 = *(v2h *)&W[i * N + j + 0];
+      w1 = *(v2h *)&W[i * N + j + 2];
+      w2 = *(v2h *)&W[i * N + j + 4];
+      w3 = *(v2h *)&W[i * N + j + 6];
+      asm volatile("vfdotpex.s.h %0, %1, %2;" : "+r"(b) : "r"(a0), "r"(w0));
+      asm volatile("vfdotpex.s.h %0, %1, %2;" : "+r"(b) : "r"(a1), "r"(w1));
+      asm volatile("vfdotpex.s.h %0, %1, %2;" : "+r"(b) : "r"(a2), "r"(w2));
+      asm volatile("vfdotpex.s.h %0, %1, %2;" : "+r"(b) : "r"(a3), "r"(w3));
+    }
+    // ReLU
+    b = ((b < 0.0f) && (relu == 1)) ? 0.0f : b;
+    // Convert back to fp16 and store the output
+    asm volatile("fcvt.h.s %0, %1;" : "=r"(b_f16) : "r"(b));
+    B[i] = b_f16;
+  }
+
+  return;
+}
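Note: the unrolled kernel consumes eight fp16 elements (four v2h pairs) per inner-loop iteration, hence the j += 8 stride and the implicit requirement that N be a multiple of 8 (the shipped data uses N = 32; the plain variant only needs N even). A plain-Python sketch of the same schedule, for illustration only:

def fullyconn_unrolled4(A, W, B, M, N, relu=True):
    # Mirrors fullyconn_f16s_unrolled4: four two-element dot products
    # per trip, all feeding one accumulator, then an optional ReLU.
    Y = [0.0] * M
    for i in range(M):
        b = float(B[i])
        for j in range(0, N, 8):      # eight elements per trip
            for k in range(0, 8, 2):  # one v2h pair at a time
                b += A[j + k] * W[i * N + j + k]
                b += A[j + k + 1] * W[i * N + j + k + 1]
        Y[i] = max(b, 0.0) if relu else b
    return Y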