diff --git a/software/apps/baremetal/messagep_f16/main.c b/software/apps/baremetal/messagep_f16/main.c
index 8e7ab3d2c..dd1bafb0c 100644
--- a/software/apps/baremetal/messagep_f16/main.c
+++ b/software/apps/baremetal/messagep_f16/main.c
@@ -16,11 +16,11 @@
 #include "baremetal/mempool_messagep_f16.h"
 #include "data_messagep_f16.h"
 
-__fp16 l1_A[matrix_N]
+__fp16 l1_A[matrix_P * matrix_M * matrix_N * matrix_D]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
-__fp16 l1_W[matrix_M * matrix_N]
+__fp16 l1_B[matrix_P * matrix_M * matrix_N * matrix_D]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
-__fp16 l1_B[matrix_M]
+__fp16 l1_HL[matrix_P * matrix_M * matrix_N * width_HL]
     __attribute__((aligned(sizeof(int32_t)), section(".l1_prio")));
 
 int main() {
@@ -31,22 +31,26 @@ int main() {
 
   // Initialize Matrices 1
   if (core_id == 0) {
-    dma_memcpy_blocking(l1_A, l2_A, (matrix_N) * sizeof(int16_t));
-    dma_memcpy_blocking(l1_W, l2_W, (matrix_M, matrix_N) * sizeof(int16_t));
-    if (BIAS == 1) {
-      dma_memcpy_blocking(l1_B, l2_B, (matrix_M) * sizeof(int16_t));
-    }
+    dma_memcpy_blocking(l1_A, l2_A,
+                        (matrix_P * matrix_M * matrix_N * matrix_D) *
+                            sizeof(int16_t));
+    dma_memcpy_blocking(l1_HL, l2_HL,
+                        (matrix_P * matrix_M * matrix_N * width_HL) *
+                            sizeof(int16_t));
   }
   mempool_barrier(num_cores);
 
   if (core_id == 0) {
     // Execute function to test.
     mempool_start_benchmark();
-    fullyconn_f16s_unrolled4(l1_A, l1_B, l1_W, matrix_M, matrix_N, BIAS, RELU);
+    messagep_f16s_unrolled4(l1_A, l1_B, matrix_P, matrix_M, matrix_N, matrix_D,
+                            FC_LAYER, l1_HL, l2_W_fc1, l2_W_fc2, width_HL, BIAS,
+                            RELU);
     mempool_stop_benchmark();
   }
   mempool_barrier(num_cores);
 
-  mempool_check_f16(l1_B, l2_Y, matrix_M, 0.01f, 0);
+  mempool_check_f16(l1_B, l2_B, matrix_P * matrix_M * matrix_N * matrix_D,
+                    0.01f, 0);
   mempool_barrier(num_cores);
 
   return 0;
diff --git a/software/data/gendata_header.py b/software/data/gendata_header.py
index 4ae1111c8..ce8c92712 100644
--- a/software/data/gendata_header.py
+++ b/software/data/gendata_header.py
@@ -197,7 +197,7 @@ def get_type(type_string):
         "memcpy": {"func": datalib.generate_iarray},
         "conv2d_depthwise_f16": {"func": datalib_nn.generate_fconv2d_depthwise_pointwise},
         "layernorm_f16": {"func": datalib_nn.generate_flayernorm},
-        "messagep_f16": {"func": datalib_nn.generate_ffullyconn},
+        "messagep_f16": {"func": datalib_nn.generate_fmessagep},
     }
 
     # Check if app_name exists in the function map
diff --git a/software/data/gendata_params.hjson b/software/data/gendata_params.hjson
index 48257ee02..0fdba4138 100644
--- a/software/data/gendata_params.hjson
+++ b/software/data/gendata_params.hjson
@@ -318,19 +318,40 @@
     ]
   }
 
+// "messagep_f16": {
+//   "type": "float16",
+//   "defines": [
+//     ("matrix_M", 32)
+//     ("matrix_N", 32)
+//     ("BIAS", 1)
+//     ("RELU", 1)
+//   ]
+//   "arrays": [
+//     ("__fp16", "l2_A")
+//     ("__fp16", "l2_Y")
+//     ("__fp16", "l2_W")
+//     ("__fp16", "l2_B")
+//   ]
+// },
+
   "messagep_f16": {
     "type": "float16",
     "defines": [
-      ("matrix_M", 32)
-      ("matrix_N", 32)
-      ("BIAS", 1)
-      ("RELU", 1)
+      ("matrix_M", 256)
+      ("matrix_N", 14)
+      ("matrix_P", 4)
+      ("matrix_D", 4)
+      ("width_HL", 8)
+      ("FC_LAYER", 1)
+      ("BIAS", 1)
+      ("RELU", 1)
     ]
     "arrays": [
       ("__fp16", "l2_A")
-      ("__fp16", "l2_Y")
-      ("__fp16", "l2_W")
      ("__fp16", "l2_B")
+      ("__fp16", "l2_HL")
+      ("__fp16", "l2_W_fc1")
+      ("__fp16", "l2_W_fc2")
     ]
   },
diff --git a/software/data/gendatalib.py b/software/data/gendatalib.py
index 4008bb66c..c847140e2 100644
--- a/software/data/gendatalib.py
+++ b/software/data/gendatalib.py
@@ -220,186 +220,6 @@ def generate_fcmatmul(my_type=np.float32, defines={}):
 
     return [A, B, C], defines
 
 
-def fconv2d_depthwise(A, W, B):
-    """Two-dimensional depthwise convolution.
-
-    Uses SAME padding with 0s, a stride of 1 and no dilation. A single output
-    channel is used per input channel (channel_multiplier=1).
-
-    input: input array with shape (height, width, in_depth)
-    w: filter array with shape (fd, fd, in_depth)
-
-    Returns a result with shape (height, width, in_depth).
-    """
-
-    [matrix_M, matrix_N, matrix_D] = np.shape(A)
-    kernel_K = np.shape(W)[0]
-
-    padw = kernel_K // 2
-    padded_input = np.pad(A,
-                          pad_width=((padw, padw), (padw, padw), (0, 0)),
-                          mode='constant',
-                          constant_values=0)
-
-    for c in range(matrix_D):
-        # For each input channel separately, apply its corresponsing filter
-        # to the input.
-        for i in range(matrix_M):
-            for j in range(matrix_N):
-
-                for fi in range(kernel_K):
-                    for fj in range(kernel_K):
-                        w_element = W[fi, fj, c]
-                        B[i, j, c] += (
-                            padded_input[i + fi, j + fj, c] * w_element)
-    return B
-
-
-def fconv2d_pointwise(A, W, B):
-    """Depthwise separable convolution.
-
-    Performs a pointwise 1x1 convolution with w_pointwise.
-
-    Uses SAME padding with 0s, a stride of 1 and no dilation. A single output
-    channel is used per input channel (channel_multiplier=1) in w_depth.
-
-    input: input array with shape (height, width, in_depth)
-    w_pointwise: pointwise filter array with shape (in_depth, out_depth)
-
-    Returns a result with shape (height, width, out_depth).
-    """
-    # First run the depthwise convolution. Its result has the same shape as
-    # input.
-
-    [matrix_M, matrix_N, matrix_D] = np.shape(A)
-    kernel_D = np.shape(W)[1]
-
-    for out_c in range(kernel_D):
-
-        for i in range(matrix_M):
-            for j in range(matrix_N):
-                for c in range(matrix_D):
-                    w_element = W[c, out_c]
-                    B[i, j, out_c] += A[i, j, c] * w_element
-    return B
-
-
-def generate_fconv2d_depthwise_pointwise(my_type=np.float32, defines={}):
-
-    matrix_M = defines['matrix_M']  # width of input
-    matrix_N = defines['matrix_N']  # height of input
-    matrix_D = defines['matrix_D']  # depth of input
-
-    kernel_K = defines['kernel_K']  # Width of kernel
-    kernel_D = defines['kernel_D']  # Channels of kernel
-
-    A = np.random.rand(matrix_M, matrix_N, matrix_D).astype(my_type)
-    Wd = np.random.rand(kernel_K, kernel_K, matrix_D).astype(my_type)
-    Wp = (5 * np.random.rand(matrix_D, kernel_D) - 2.5)
-
-    B = np.zeros((matrix_M, matrix_N, matrix_D), dtype=my_type)
-    B = fconv2d_depthwise(A, Wd, B)
-    Bd = np.reshape(B, (matrix_M * matrix_N * matrix_D)).astype(my_type)
-
-    Bp = np.zeros((matrix_M, matrix_N, kernel_D), dtype=my_type)
-    Bp = fconv2d_pointwise(B, Wp, Bp)
-    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
-    Bp = np.reshape(Bp, (matrix_M * matrix_N * kernel_D)).astype(my_type)
-    Wd = np.reshape(Wd, (kernel_K * kernel_K * matrix_D)).astype(my_type)
-    Wp = np.reshape(Wp, (matrix_D * kernel_D), order='F').astype(my_type)
-
-    return [A, Wd, Wp, Bd, Bp], defines
-
-
-def generate_fconv2d_depthwise(my_type=np.float32, defines={}):
-
-    matrix_M = defines['matrix_M']  # width of input
-    matrix_N = defines['matrix_N']  # height of input
-    matrix_D = defines['matrix_D']  # depth of input
-
-    kernel_K = defines['kernel_K']  # Channels of kernel
-
-    A = np.random.rand(matrix_M, matrix_N, matrix_D).astype(my_type)
-    W = np.random.rand(kernel_K, kernel_K, matrix_D).astype(my_type)
-    B = np.zeros((matrix_M, matrix_N, matrix_D), dtype=my_type)
-
-    B = fconv2d_depthwise(A, W, B)
-
-    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
-    B = np.reshape(B, (matrix_M * matrix_N * matrix_D)).astype(my_type)
-    W = np.reshape(W, (kernel_K * kernel_K * matrix_D)).astype(my_type)
-
-    return [A, W, B], defines
-
-
-def generate_fconv2d_pointwise(my_type=np.float32, defines={}):
-
-    matrix_M = defines['matrix_M']  # width of input
-    matrix_N = defines['matrix_N']  # height of input
-    matrix_D = defines['matrix_D']  # depth of input
-
-    kernel_D = defines['kernel_D']  # Channels of kernel
-
-    A = (5 * np.random.rand(matrix_M, matrix_N, matrix_D) - 2.5)
-    W = (5 * np.random.rand(matrix_D, kernel_D) - 2.5)
-    A = A.astype(my_type)
-    W = W.astype(my_type)
-    B = np.zeros((matrix_M, matrix_N, kernel_D), dtype=my_type)
-
-    B = fconv2d_pointwise(A, W, B)
-
-    A = np.reshape(A, (matrix_M * matrix_N * matrix_D)).astype(my_type)
-    B = np.reshape(B, (matrix_M * matrix_N * kernel_D)).astype(my_type)
-    W = np.reshape(W, (matrix_D * kernel_D), order='F').astype(my_type)
-
-    return [A, W, B], defines
-
-
-def generate_ffullyconn(my_type=np.float32, defines={}):
-
-    matrix_M = defines['matrix_M']  # width of input
-    matrix_N = defines['matrix_N']  # height of input
-
-    W = (5 * np.random.rand(matrix_M, matrix_N) - 2.5).astype(my_type)
-    A = (5 * np.random.rand(matrix_N) - 2.5).astype(my_type)
-    if defines['BIAS'] == 1:
-        B = (5 * np.random.rand(matrix_M) - 2.5).astype(my_type)
-    else:
-        B = np.zeros((matrix_M), dtype=my_type)
-
-    B += np.matmul(W, A).astype(my_type)
-    if defines['RELU'] == 1:
-        B = np.maximum(B, 0)
-    Y = B
-
-    return [A, Y, B, W], defines
-
-
-def generate_flayernorm(my_type=np.float32, defines={}):
-
-    # Create matrix
-    array_N = defines['array_N']
-    X = (np.random.rand(array_N)).astype(my_type)
-
-    eps = np.array([0.01], dtype=np.float32)
-    gamma = np.array([np.random.rand() - 0.5], dtype=np.float32)
-    beta = np.array([np.random.rand() - 0.5], dtype=np.float32)
-
-    # Compute mean and variance along the last axis
-    mean = np.mean(X, axis=-1, keepdims=True).astype(my_type)
-    var = np.var(X, axis=-1, keepdims=True).astype(my_type)
-
-    # Normalize
-    X_normalized = (X - mean) / np.sqrt(var + eps)
-    # Scale and shift
-    Y = gamma * X_normalized + beta
-
-    if defines['RELU'] == 1:
-        Y = np.maximum(Y, 0)
-
-    return [X, Y, eps, gamma, beta], defines
-
-
 def generate_fmatmul(my_type=np.float32, defines={}):
 
     # Create matrix
diff --git a/software/data/gendatalib_nn.py b/software/data/gendatalib_nn.py
index c69b50f6e..f201a100d 100644
--- a/software/data/gendatalib_nn.py
+++ b/software/data/gendatalib_nn.py
@@ -193,3 +193,60 @@ def generate_flayernorm(my_type=np.float32, defines={}):
         Y = np.maximum(Y, 0)
 
     return [X, Y, eps, gamma, beta], defines
+
+
+def generate_fmessagep(my_type=np.float32, defines={}):
+
+    matrix_P = defines['matrix_P']  # number of graph nodes
+    matrix_M = defines['matrix_M']  # width of input
+    matrix_N = defines['matrix_N']  # height of input
+    matrix_D = defines['matrix_D']  # depth of input
+    width_HL = defines['width_HL']  # width of the hidden layer
+
+    A = np.random.rand(matrix_P, matrix_M, matrix_N, matrix_D).astype(my_type)
+    B = np.zeros((matrix_P, matrix_M, matrix_N, matrix_D), dtype=my_type)
+
+    # Outputs and parameters of the hidden-layer
+    W_fc1 = np.random.rand(matrix_P, width_HL, matrix_D).astype(my_type)
+    W_fc2 = np.random.rand(matrix_P, matrix_D, width_HL).astype(my_type)
+    if defines['BIAS'] == 1:
+        HL = np.random.rand(matrix_P, matrix_M, matrix_N, width_HL)
+        HL = HL.astype(my_type)
+    else:
+        HL = np.zeros((matrix_P, matrix_M, matrix_N, width_HL))
+        HL = HL.astype(my_type)
+
+    # Loops over the 2D image
+    for i in range(matrix_M):
+        for j in range(matrix_N):
+            # Loops over the message passing instances
+            for p in range(matrix_P):
+
+                if defines['FC_LAYER'] == 1:
+                    # Apply hidden-layer
+                    HL[p, i, j, :] += np.matmul(W_fc1[p, :], A[p, i, j, :])
+                    if defines['RELU'] == 1:
+                        HL = np.maximum(HL, 0)
+                    A[p, i, j, :] = np.matmul(W_fc2[p, :], HL[p, i, j, :])
+
+                # Loop over depth and sum the message passing instances
+                for d in range(matrix_D):
+                    sum_val = np.float16(0.0)
+                    for np_idx in range(matrix_P):
+                        if np_idx != p:
+                            sum_val += A[np_idx, i, j, d]
+
+                    # Divide sum
+                    sum_val = sum_val / np.float16(matrix_P)
+                    B[p, i, j, d] = sum_val
+
+    A = np.reshape(A, (matrix_P * matrix_M * matrix_N * matrix_D))
+    B = np.reshape(B, (matrix_P * matrix_M * matrix_N * matrix_D))
+    HL = np.reshape(HL, (matrix_P * matrix_M * matrix_N * width_HL))
+    W_fc1 = np.reshape(W_fc1, (matrix_P * width_HL * matrix_D))
+    W_fc2 = np.reshape(W_fc2, (matrix_P * matrix_D * width_HL))
+
+    A = A.astype(my_type)
+    B = B.astype(my_type)
+
+    return [A, B, HL, W_fc1, W_fc2], defines
diff --git a/software/kernels/baremetal/mempool_messagep_f16.h b/software/kernels/baremetal/mempool_messagep_f16.h
index 3e17cb56c..70be2ce5f 100644
--- a/software/kernels/baremetal/mempool_messagep_f16.h
+++ b/software/kernels/baremetal/mempool_messagep_f16.h
@@ -7,16 +7,16 @@
 #pragma once
 #include "builtins_v2.h"
 
-void fullyconn_f16s(__fp16 const *__restrict__ A, __fp16 *B,
-                    __fp16 *__restrict__ W, uint32_t M, uint32_t N,
-                    uint32_t bias, uint32_t relu) {
+static inline void fullyconn_f16s(__fp16 const *__restrict__ A, __fp16 *B,
+                                  __fp16 *__restrict__ W, uint32_t wM,
+                                  uint32_t wN, uint32_t bias, uint32_t relu) {
 
   uint32_t i, j;
   v2h a, w;
   __fp16 b_f16;
   float b;
 
-  for (i = 0; i < M; i++) {
+  for (i = 0; i < wM; i++) {
     // Initialize accumulator
     if (bias) {
       b_f16 = B[i];
@@ -25,9 +25,9 @@ void fullyconn_f16s(__fp16 const *__restrict__ A, __fp16 *B,
       b = 0.0f;
     }
     // Matrix vector multiply
-    for (j = 0; j < N; j += 2) {
+    for (j = 0; j < wN; j += 2) {
       a = *(v2h *)&A[j];
-      w = *(v2h *)&W[i * N + j];
+      w = *(v2h *)&W[i * wN + j];
       asm volatile("vfdotpex.s.h %0, %1, %2;" : "+r"(b) : "r"(a), "r"(w));
     }
     // ReLU
@@ -40,9 +40,10 @@ void fullyconn_f16s(__fp16 const *__restrict__ A, __fp16 *B,
   return;
 }
 
-void fullyconn_f16s_unrolled4(__fp16 const *__restrict__ A, __fp16 *B,
-                              __fp16 *__restrict__ W, uint32_t M, uint32_t N,
-                              uint32_t bias, uint32_t relu) {
+static inline void fullyconn_f16s_unrolled4(__fp16 const *__restrict__ A,
+                                            __fp16 *B, __fp16 *__restrict__ W,
+                                            uint32_t wM, uint32_t wN,
+                                            uint32_t bias, uint32_t relu) {
 
   uint32_t i, j;
   v2h w0, w1, w2, w3;
@@ -50,7 +51,7 @@ void fullyconn_f16s_unrolled4(__fp16 const *__restrict__ A, __fp16 *B,
   __fp16 b_f16;
   float b;
 
-  for (i = 0; i < M; i++) {
+  for (i = 0; i < wM; i++) {
     // Initialize accumulator
     if (bias) {
       b_f16 = B[i];
@@ -59,15 +60,15 @@ void fullyconn_f16s_unrolled4(__fp16 const *__restrict__ A, __fp16 *B,
       b = 0.0f;
     }
     // Matrix vector multiply
-    for (j = 0; j < N; j += 2) {
+    for (j = 0; j < wN; j += 2) {
       a0 = *(v2h *)&A[j + 0];
       a1 = *(v2h *)&A[j + 2];
       a2 = *(v2h *)&A[j + 4];
       a3 = *(v2h *)&A[j + 6];
-      w0 = *(v2h *)&W[i * N + j + 0];
-      w1 = *(v2h *)&W[i * N + j + 2];
-      w2 = *(v2h *)&W[i * N + j + 4];
-      w3 = *(v2h *)&W[i * N + j + 6];
+      w0 = *(v2h *)&W[i * wN + j + 0];
+      w1 = *(v2h *)&W[i * wN + j + 2];
+      w2 = *(v2h *)&W[i * wN + j + 4];
+      w3 = *(v2h *)&W[i * wN + j + 6];
       asm volatile("vfdotpex.s.h %0, %1, %2;" : "+r"(b) : "r"(a0), "r"(w0));
       asm volatile("vfdotpex.s.h %0, %1, %2;" : "+r"(b) : "r"(a1), "r"(w1));
"r"(a1), "r"(w1)); asm volatile("vfdotpex.s.h %0, %1, %2;" : "+r"(b) : "r"(a2), "r"(w2)); @@ -82,3 +83,171 @@ void fullyconn_f16s_unrolled4(__fp16 const *__restrict__ A, __fp16 *B, return; } + +/* + The kernel combines the information from matrix_P tensors by averaging over + the matrix_P dimension matrix_P: message passing instances of the tensor + matrix_M: rows of the input tensor (as in 2D matrix) + matrix_N: rows of the input tensor (as in 2D matrix) + matrix_D: depth of the input tensor + + Parameters of optional hiddel layer: + HL: pointer to hiddel layer output + W_fc1: weights of first fully-connected layer + W_fc2: weights of second fully-connected layer + wHL: depth of the hidden-layer + bias: optional bias + relu: optional relu +*/ +void messagep_f16s(__fp16 *A, __fp16 *B, uint32_t matrix_P, uint32_t matrix_M, + uint32_t matrix_N, uint32_t matrix_D, uint32_t fc_layer, + __fp16 __attribute__((unused)) * HL, + __fp16 __attribute__((unused)) * W_fc1, + __fp16 __attribute__((unused)) * W_fc2, + uint32_t __attribute__((unused)) wHL, + uint32_t __attribute__((unused)) bias, + uint32_t __attribute__((unused)) relu) { + + uint32_t p, i, j, d, mp; + v2h a; + v2h sum; + + __fp16 N_f16; + asm volatile("fcvt.h.wu %0, %1" : "+r"(N_f16) : "r"(matrix_P)); + asm volatile("pv.pack %0, %0, %0" : "+r"(N_f16)); + + // Loops over the 2D image + for (i = 0; i < matrix_M; i++) { + for (j = 0; j < matrix_N; j++) { + + // Apply FC-layer + if (fc_layer) { + // Loops over the message passing instances + for (p = 0; p < matrix_P; p++) { + // Compute the dense layer (wHL == depth of the hidden layer) + __fp16 *ptr1 = &A[p * matrix_M * matrix_N * matrix_D + + i * matrix_N * matrix_D + j * matrix_D]; + __fp16 *ptr2 = &HL[p * matrix_M * matrix_N * matrix_D + + i * matrix_N * wHL + j * wHL]; + fullyconn_f16s(ptr1, ptr2, &W_fc1[p * wHL * matrix_D], wHL, matrix_D, + bias, relu); + fullyconn_f16s(ptr2, ptr1, &W_fc2[p * matrix_D * wHL], matrix_D, wHL, + bias, relu); + } + } + + // Loops over the message passing instances + for (p = 0; p < matrix_P; p++) { + // Loop over depth and sum the message passing instances + for (d = 0; d < matrix_D; d += 2) { + sum = (v2h)0.0f; + for (mp = p + 1; mp < matrix_P; mp++) { + a = *(v2h *)&A[mp * matrix_M * matrix_N * matrix_D + + i * matrix_N * matrix_D + j * matrix_D + d]; + asm volatile("vfadd.h %0, %0, %1" : "+r"(sum) : "r"(a)); + } + for (mp = 0; mp < p; mp++) { + a = *(v2h *)&A[mp * matrix_M * matrix_N * matrix_D + + i * matrix_N * matrix_D + j * matrix_D + d]; + asm volatile("vfadd.h %0, %0, %1" : "+r"(sum) : "r"(a)); + } + // Divide sum + asm volatile("vfdiv.h %0, %0, %1" : "+r"(sum) : "r"(N_f16)); + *((v2h *)&B[p * matrix_M * matrix_N * matrix_D + + i * matrix_N * matrix_D + j * matrix_D + d]) = sum; + } + } + } + } + + return; +} + +void messagep_f16s_unrolled4(__fp16 *A, __fp16 *B, uint32_t matrix_P, + uint32_t matrix_M, uint32_t matrix_N, + uint32_t matrix_D, uint32_t fc_layer, + __fp16 __attribute__((unused)) * HL, + __fp16 __attribute__((unused)) * W_fc1, + __fp16 __attribute__((unused)) * W_fc2, + uint32_t __attribute__((unused)) wHL, + uint32_t __attribute__((unused)) bias, + uint32_t __attribute__((unused)) relu) { + + uint32_t p, i, j, d, mp; + v2h a0, a1, a2, a3; + v2h s0, s1, s2, s3; + + __fp16 N_f16; + asm volatile("fcvt.h.wu %0, %1" : "+r"(N_f16) : "r"(matrix_P)); + asm volatile("pv.pack %0, %0, %0" : "+r"(N_f16)); + + // Loops over the 2D image + for (i = 0; i < matrix_M; i++) { + for (j = 0; j < matrix_N; j++) { + + // Apply FC-layer + if (fc_layer) { + 
+        // Loops over the message passing instances
+        for (p = 0; p < matrix_P; p++) {
+          // Compute the dense layer (wHL == width of the hidden layer)
+          __fp16 *ptr1 = &A[p * matrix_M * matrix_N * matrix_D +
+                            i * matrix_N * matrix_D + j * matrix_D];
+          __fp16 *ptr2 = &HL[p * matrix_M * matrix_N * wHL +
+                             i * matrix_N * wHL + j * wHL];
+          fullyconn_f16s_unrolled4(ptr1, ptr2, &W_fc1[p * wHL * matrix_D], wHL,
+                                   matrix_D, bias, relu);
+          fullyconn_f16s_unrolled4(ptr2, ptr1, &W_fc2[p * matrix_D * wHL],
+                                   matrix_D, wHL, bias, relu);
+        }
+      }
+
+      // Loops over the message passing instances
+      for (p = 0; p < matrix_P; p++) {
+        // Loop over depth and sum the message passing instances
+        for (d = 0; d < matrix_D; d += 8) {
+          s0 = (v2h)0.0f;
+          s1 = (v2h)0.0f;
+          s2 = (v2h)0.0f;
+          s3 = (v2h)0.0f;
+          for (mp = p + 1; mp < matrix_P; mp++) {
+            __fp16 *a_ptr = &A[mp * matrix_M * matrix_N * matrix_D +
+                               i * matrix_N * matrix_D + j * matrix_D];
+            a0 = *(v2h *)&a_ptr[d];
+            a1 = *(v2h *)&a_ptr[d + 2];
+            a2 = *(v2h *)&a_ptr[d + 4];
+            a3 = *(v2h *)&a_ptr[d + 6];
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s0) : "r"(a0));
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s1) : "r"(a1));
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s2) : "r"(a2));
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s3) : "r"(a3));
+          }
+          for (mp = 0; mp < p; mp++) {
+            __fp16 *a_ptr = &A[mp * matrix_M * matrix_N * matrix_D +
+                               i * matrix_N * matrix_D + j * matrix_D];
+            a0 = *(v2h *)&a_ptr[d];
+            a1 = *(v2h *)&a_ptr[d + 2];
+            a2 = *(v2h *)&a_ptr[d + 4];
+            a3 = *(v2h *)&a_ptr[d + 6];
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s0) : "r"(a0));
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s1) : "r"(a1));
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s2) : "r"(a2));
+            asm volatile("vfadd.h %0, %0, %1" : "+r"(s3) : "r"(a3));
+          }
+          // Divide sum
+          asm volatile("vfdiv.h %0, %0, %1" : "+r"(s0) : "r"(N_f16));
+          asm volatile("vfdiv.h %0, %0, %1" : "+r"(s1) : "r"(N_f16));
+          asm volatile("vfdiv.h %0, %0, %1" : "+r"(s2) : "r"(N_f16));
+          asm volatile("vfdiv.h %0, %0, %1" : "+r"(s3) : "r"(N_f16));
+          __fp16 *b_ptr = &B[p * matrix_M * matrix_N * matrix_D +
+                             i * matrix_N * matrix_D + j * matrix_D];
+          *((v2h *)&b_ptr[d + 0]) = s0;
+          *((v2h *)&b_ptr[d + 2]) = s1;
+          *((v2h *)&b_ptr[d + 4]) = s2;
+          *((v2h *)&b_ptr[d + 6]) = s3;
+        }
+      }
+    }
+  }
+
+  return;
+}
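For reference, a minimal NumPy sketch (not part of the patch) of the neighbour-averaging step that messagep_f16s implements when the FC layer is disabled; the helper name and the vectorized formulation are illustrative, the shapes follow gendata_params.hjson, and the divisor is matrix_P (not matrix_P - 1), matching the vfdiv by matrix_P in the kernel and the generator above.

```python
import numpy as np


def messagep_average(A):
    """A has shape (P, M, N, D). For each instance p, sum the other P - 1
    instances at the same (i, j, d) and divide by P."""
    P = A.shape[0]
    B = (np.sum(A, axis=0, keepdims=True) - A) / np.float16(P)
    return B.astype(A.dtype)


# Shapes from gendata_params.hjson: (matrix_P, matrix_M, matrix_N, matrix_D)
A = np.random.rand(4, 256, 14, 4).astype(np.float16)
B = messagep_average(A)
```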