[software] Add fully_connected_f16 kernel
1 parent 76302d8 · commit 32224c1 · 6 changed files with 372 additions and 2 deletions.
@@ -0,0 +1,53 @@
// Copyright 2021 ETH Zurich and University of Bologna.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

// Author: Marco Bertuletti, ETH Zurich

#include <stdint.h>
#include <string.h>

#include "dma.h"
#include "encoding.h"
#include "runtime.h"
#include "synchronization.h"

#include "baremetal/mempool_checks.h"
#include "baremetal/mempool_messagep_f16.h"
#include "data_messagep_f16.h"

__fp16 l1_A[matrix_N]
    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 l1_W[matrix_M * matrix_N]
    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
__fp16 l1_B[matrix_M]
    __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));

int main() {
  uint32_t core_id = mempool_get_core_id();
  uint32_t num_cores = mempool_get_core_count();
  // Initialize barrier and synchronize
  mempool_barrier_init(core_id);

  // Initialize matrices: copy inputs from L2 to L1 memory
  if (core_id == 0) {
    dma_memcpy_blocking(l1_A, l2_A, (matrix_N) * sizeof(int16_t));
    dma_memcpy_blocking(l1_W, l2_W, (matrix_M * matrix_N) * sizeof(int16_t));
    if (BIAS == 1) {
      dma_memcpy_blocking(l1_B, l2_B, (matrix_M) * sizeof(int16_t));
    }
  }
  mempool_barrier(num_cores);

  if (core_id == 0) {
    // Execute function to test.
    mempool_start_benchmark();
    fullyconn_f16s_unrolled4(l1_A, l1_B, l1_W, matrix_M, matrix_N, BIAS, RELU);
    mempool_stop_benchmark();
  }
  mempool_barrier(num_cores);
  mempool_check_f16(l1_B, l2_Y, matrix_M, 0.01f, 0);
  mempool_barrier(num_cores);

  return 0;
}
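The final check compares l1_B against the reference l2_Y baked into the generated data header. A minimal NumPy sketch of that reference computation (hedged: it assumes, as generate_ffullyconn in the data-generation script below does, a row-major M x N weight matrix, an optional bias, and an optional ReLU; the sizes here are illustrative only):

import numpy as np
matrix_M, matrix_N = 16, 16                                 # illustrative sizes
A = np.random.rand(matrix_N).astype(np.float16)             # input vector
W = np.random.rand(matrix_M, matrix_N).astype(np.float16)   # row-major weights
B = np.random.rand(matrix_M).astype(np.float16)             # bias (if BIAS == 1)
Y = W @ A + B                                               # fully-connected layer
Y = np.maximum(Y, 0)                                        # ReLU (if RELU == 1)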
@@ -0,0 +1,195 @@
#!/usr/bin/env python3

# Copyright 2022 ETH Zurich and University of Bologna.
# Solderpad Hardware License, Version 0.51, see LICENSE for details.
# SPDX-License-Identifier: SHL-0.51

# This script generates golden data for the fp16 kernels (depthwise and
# pointwise convolutions, fully-connected layer, and layer normalization).
# Author: Marco Bertuletti <[email protected]>

# The script generates random inputs for the C functions.

import numpy as np
import math


def fconv2d_depthwise(A, W, B):
    """Two-dimensional depthwise convolution.
    Uses SAME padding with 0s, a stride of 1 and no dilation. A single output
    channel is used per input channel (channel_multiplier=1).
    A: input array with shape (height, width, in_depth)
    W: filter array with shape (fd, fd, in_depth)
    B: output accumulator with shape (height, width, in_depth)
    Returns a result with shape (height, width, in_depth).
    """

    [matrix_M, matrix_N, matrix_D] = np.shape(A)
    kernel_K = np.shape(W)[0]

    padw = kernel_K // 2
    padded_input = np.pad(A,
                          pad_width=((padw, padw), (padw, padw), (0, 0)),
                          mode='constant',
                          constant_values=0)

    for c in range(matrix_D):
        # For each input channel separately, apply its corresponding filter
        # to the input.
        for i in range(matrix_M):
            for j in range(matrix_N):

                for fi in range(kernel_K):
                    for fj in range(kernel_K):
                        w_element = W[fi, fj, c]
                        B[i, j, c] += (
                            padded_input[i + fi, j + fj, c] * w_element)
    return B
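

# Illustrative usage sketch (not part of the data-generation flow below; the
# shapes are hypothetical examples): with SAME padding the depthwise
# convolution keeps the spatial size and the channel count of the input.
def _example_fconv2d_depthwise():
    A = np.random.rand(4, 4, 3).astype(np.float16)  # (height, width, in_depth)
    W = np.random.rand(3, 3, 3).astype(np.float16)  # (fd, fd, in_depth)
    B = fconv2d_depthwise(A, W, np.zeros_like(A))
    assert B.shape == (4, 4, 3)
    return B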


def fconv2d_pointwise(A, W, B):
    """Pointwise (1x1) convolution.
    Combines the input channels with the pointwise filter W, using a stride
    of 1; a 1x1 kernel needs no padding. Each column of W produces one
    output channel.
    A: input array with shape (height, width, in_depth)
    W: pointwise filter array with shape (in_depth, out_depth)
    B: output accumulator with shape (height, width, out_depth)
    Returns a result with shape (height, width, out_depth).
    """

    [matrix_M, matrix_N, matrix_D] = np.shape(A)
    kernel_D = np.shape(W)[1]

    for out_c in range(kernel_D):

        for i in range(matrix_M):
            for j in range(matrix_N):
                for c in range(matrix_D):
                    w_element = W[c, out_c]
                    B[i, j, out_c] += A[i, j, c] * w_element
    return B
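

# Another illustrative sketch (hypothetical shapes): the pointwise convolution
# maps in_depth channels to out_depth channels, keeping height and width.
def _example_fconv2d_pointwise():
    A = np.random.rand(4, 4, 3).astype(np.float16)  # (height, width, in_depth)
    W = np.random.rand(3, 8).astype(np.float16)     # (in_depth, out_depth)
    B = fconv2d_pointwise(A, W, np.zeros((4, 4, 8), dtype=np.float16))
    assert B.shape == (4, 4, 8)
    return B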


def generate_fconv2d_depthwise_pointwise(my_type=np.float32, defines={}):

    matrix_M = defines['matrix_M']  # height of input
    matrix_N = defines['matrix_N']  # width of input
    matrix_D = defines['matrix_D']  # depth (channels) of input

    kernel_K = defines['kernel_K']  # width of kernel
    kernel_D = defines['kernel_D']  # output channels of kernel

    A = np.random.rand(matrix_M, matrix_N, matrix_D).astype(my_type)
    Wd = np.random.rand(kernel_K, kernel_K, matrix_D).astype(my_type)
    Wp = (5 * np.random.rand(matrix_D, kernel_D) - 2.5)

    B = np.zeros((matrix_M, matrix_N, matrix_D), dtype=my_type)
    B = fconv2d_depthwise(A, Wd, B)
    Bd = np.reshape(B, (matrix_M * matrix_N * matrix_D)).astype(my_type)

    Bp = np.zeros((matrix_M, matrix_N, kernel_D), dtype=my_type)
    Bp = fconv2d_pointwise(B, Wp, Bp)
    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
    Bp = np.reshape(Bp, (matrix_M * matrix_N * kernel_D)).astype(my_type)
    Wd = np.reshape(Wd, (kernel_K * kernel_K * matrix_D)).astype(my_type)
    Wp = np.reshape(Wp, (matrix_D * kernel_D), order='F').astype(my_type)

    return [A, Wd, Wp, Bd, Bp], defines


def generate_fconv2d_depthwise(my_type=np.float32, defines={}):

    matrix_M = defines['matrix_M']  # height of input
    matrix_N = defines['matrix_N']  # width of input
    matrix_D = defines['matrix_D']  # depth (channels) of input

    kernel_K = defines['kernel_K']  # width of kernel

    A = np.random.rand(matrix_M, matrix_N, matrix_D).astype(my_type)
    W = np.random.rand(kernel_K, kernel_K, matrix_D).astype(my_type)
    B = np.zeros((matrix_M, matrix_N, matrix_D), dtype=my_type)

    B = fconv2d_depthwise(A, W, B)

    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
    B = np.reshape(B, (matrix_M * matrix_N * matrix_D)).astype(my_type)
    W = np.reshape(W, (kernel_K * kernel_K * matrix_D)).astype(my_type)

    return [A, W, B], defines


def generate_fconv2d_pointwise(my_type=np.float32, defines={}):

    matrix_M = defines['matrix_M']  # height of input
    matrix_N = defines['matrix_N']  # width of input
    matrix_D = defines['matrix_D']  # depth (channels) of input

    kernel_D = defines['kernel_D']  # output channels of kernel

    A = (5 * np.random.rand(matrix_M, matrix_N, matrix_D) - 2.5)
    W = (5 * np.random.rand(matrix_D, kernel_D) - 2.5)
    A = A.astype(my_type)
    W = W.astype(my_type)
    B = np.zeros((matrix_M, matrix_N, kernel_D), dtype=my_type)

    B = fconv2d_pointwise(A, W, B)

    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
    B = np.reshape(B, (matrix_M * matrix_N * kernel_D)).astype(my_type)
    W = np.reshape(W, (matrix_D * kernel_D), order='F').astype(my_type)

    return [A, W, B], defines


def generate_ffullyconn(my_type=np.float32, defines={}):

    matrix_M = defines['matrix_M']  # number of outputs (rows of W)
    matrix_N = defines['matrix_N']  # number of inputs (columns of W)

    W = (5 * np.random.rand(matrix_M, matrix_N) - 2.5).astype(my_type)
    A = (5 * np.random.rand(matrix_N) - 2.5).astype(my_type)
    if defines['BIAS'] == 1:
        B = (5 * np.random.rand(matrix_M) - 2.5).astype(my_type)
    else:
        B = np.zeros((matrix_M), dtype=my_type)
    # Keep B as the bias vector and accumulate the matrix-vector product into
    # Y, so the returned B matches the bias the C kernel receives as input.
    Y = (np.matmul(W, A) + B).astype(my_type)
    if defines['RELU'] == 1:
        Y = np.maximum(Y, 0)

    W = np.reshape(W, (matrix_M * matrix_N)).astype(my_type)

    return [A, Y, W, B], defines
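

# Illustrative usage sketch (not called by the data-generation flow; the sizes
# and the BIAS/RELU values are hypothetical and mirror the defines of the C
# test, not the committed data headers).
def _example_generate_ffullyconn():
    data, _ = generate_ffullyconn(
        my_type=np.float16,
        defines={'matrix_M': 16, 'matrix_N': 16, 'BIAS': 1, 'RELU': 1})
    A, Y, W, B = data  # input, reference output, flattened weights, bias
    return A, Y, W, B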


def generate_flayernorm(my_type=np.float32, defines={}):

    # Create input array
    array_N = defines['array_N']
    X = (np.random.rand(array_N)).astype(my_type)

    eps = np.array([0.01], dtype=np.float32)
    gamma = np.array([np.random.rand() - 0.5], dtype=np.float32)
    beta = np.array([np.random.rand() - 0.5], dtype=np.float32)

    # Compute mean and variance along the last axis
    mean = np.mean(X, axis=-1, keepdims=True).astype(my_type)
    var = np.var(X, axis=-1, keepdims=True).astype(my_type)

    # Normalize
    X_normalized = (X - mean) / np.sqrt(var + eps)
    # Scale and shift
    Y = gamma * X_normalized + beta

    if defines['RELU'] == 1:
        Y = np.maximum(Y, 0)

    return [X, Y, eps, gamma, beta], defines
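
A quick standalone sanity check could be appended at the end of the script (a hedged sketch; the array size and RELU flag are arbitrary examples, not values from the committed data headers):

if __name__ == '__main__':
    data, _ = generate_flayernorm(my_type=np.float16,
                                  defines={'array_N': 64, 'RELU': 0})
    X, Y, eps, gamma, beta = data
    print('layernorm reference:', Y.shape, Y.dtype)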