1 /*
2  * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved.
3  * Copyright (c) 2019 Nuclei Limited. All rights reserved.
4  *
5  * SPDX-License-Identifier: Apache-2.0
6  *
7  * Licensed under the Apache License, Version 2.0 (the License); you may
8  * not use this file except in compliance with the License.
9  * You may obtain a copy of the License at
10  *
11  * www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
15  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19 
20 /* ----------------------------------------------------------------------
21  * Project:      NMSIS NN Library
22  * Title:        riscv_nnfunctions.h
23  * Description:  Public header file for NMSIS NN Library
24  *
25  * $Date:        13. July 2018
26  * $Revision:    V.1.0.0
27  *
28  * Target Processor: RISC-V Cores
29  * -------------------------------------------------------------------- */
30 
31 /**
32    \mainpage NMSIS NN Software Library
33    *
34    * Introduction
35    * ------------
36    *
37    * This user manual describes the NMSIS NN software library,
38    * a collection of efficient neural network kernels developed to maximize the
39    * performance and minimize the memory footprint of neural networks on Nuclei N processor cores.
40    *
41    * The library is divided into a number of functions, each covering a specific category:
42    * - Neural Network Convolution Functions
43    * - Neural Network Activation Functions
44    * - Fully-connected Layer Functions
45    * - Neural Network Pooling Functions
46    * - Softmax Functions
47    * - Neural Network Support Functions
48    *
49    * The library has separate functions for operating on different weight and activation data
50    * types, including 8-bit integers (q7_t) and 16-bit integers (q15_t). The description of the
51    * kernels is included in each function description. The implementation details are also
52    * described in this paper [1].
53    *
54    * \note Please refer to [NMSIS-NN](../../../nn/index.html)
55    *
56    * Block Diagram
57    * --------
58    * \image html NMSIS-NN-OVERVIEW.PNG
59    *
60    * Examples
61    * --------
62    *
63    * The library ships with a number of examples which demonstrate how to use the library functions.
64    *
65    * Pre-processor Macros
66    * ------------
67    *
68    * Each library project has different pre-processor macros.
69    *
70    * - RISCV_MATH_DSP:
71    *
72    * Define the macro RISCV_MATH_DSP if the silicon supports DSP instructions.
73    *
74    * - RISCV_NN_TRUNCATE:
75    *
76    * Define the macro RISCV_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation.
77    *
78    *
79    * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601
80    */
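
/*
 * A minimal configuration sketch (illustrative only; not taken from the
 * library build files). The macros described above are typically supplied as
 * compile-time options when the library sources are built, for example:
 *
 *   #define RISCV_MATH_DSP       // enable DSP-instruction kernels (only if the silicon supports them)
 *   #define RISCV_NN_TRUNCATE    // use floor instead of round-to-nearest in the fixed-point arithmetic
 */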
81 
82 /**
83  * @defgroup groupNN Neural Network Functions
84  * These functions perform basic operations for neural network layers.
85  */
86 
87 #ifndef _RISCV_NNFUNCTIONS_H
88 #define _RISCV_NNFUNCTIONS_H
89 
90 #include "riscv_nnsupportfunctions.h"
91 #include "riscv_nn_tables.h"
92 
93 #define USE_INTRINSIC
94 
95 //#define RISCV_NN_TRUNCATE /* This configures the rounding mode: floor or round-to-nearest-int */
96 
97 #ifdef __cplusplus
98 extern    "C"
99 {
100 #endif
101 
102 /**
103  * @defgroup NNConv Neural Network Convolution Functions
104  *
105  * Perform convolution layer
106  *
107  * The convolution is implemented in 2 steps: im2col and GEMM
108  *
109  * im2col is a process of converting each patch of image data into
110  * a column. After im2col, the convolution is computed as matrix-matrix
111  * multiplication.
112  *
113  * To reduce the memory footprint, the im2col is performed partially.
114  * In each iteration, only a few columns (i.e., patches) are generated and
115  * computed with GEMM kernels similar to NMSIS-DSP riscv_mat_mult functions.
116  *
117  */
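
/*
 * Illustrative sketch only, not the library code: how one output value of an
 * HWC convolution can be formed with im2col followed by a dot product.  The
 * library computes the same result, but generates only a few im2col columns
 * at a time in bufferA and feeds them to the optimized GEMM kernels declared
 * later in this file.  Variable names, the explicit saturation, and the
 * assumed [ch_im_out][ky][kx][ch_im_in] weight layout are illustrative
 * assumptions; the fragment is imagined inside loops over the output
 * position (out_x, out_y) and the filter index f.
 *
 *   // 1. im2col: gather one input patch into a column vector
 *   int idx = 0;
 *   for (int ky = 0; ky < dim_kernel; ky++) {
 *     for (int kx = 0; kx < dim_kernel; kx++) {
 *       for (int c = 0; c < ch_im_in; c++) {
 *         int iy = out_y * stride - padding + ky;
 *         int ix = out_x * stride - padding + kx;
 *         int inside = (iy >= 0 && iy < dim_im_in && ix >= 0 && ix < dim_im_in);
 *         col[idx++] = inside ? Im_in[(iy * dim_im_in + ix) * ch_im_in + c] : 0;
 *       }
 *     }
 *   }
 *
 *   // 2. GEMM row: one output channel is the dot product of the column with
 *   //    that filter's weights, plus the left-shifted bias, right-shifted
 *   //    and saturated to the q7_t range
 *   int num = ch_im_in * dim_kernel * dim_kernel;
 *   int32_t acc = (int32_t)bias[f] << bias_shift;
 *   for (int i = 0; i < num; i++) {
 *     acc += (int32_t)wt[f * num + i] * col[i];
 *   }
 *   acc >>= out_shift;
 *   if (acc > 127) acc = 127; else if (acc < -128) acc = -128;
 *   Im_out[(out_y * dim_im_out + out_x) * ch_im_out + f] = (q7_t)acc;
 */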
118 
119   /**
120    * @brief Basic Q7 convolution function
121    * @param[in]       Im_in       pointer to input tensor
122    * @param[in]       dim_im_in   input tensor dimension
123    * @param[in]       ch_im_in    number of input tensor channels
124    * @param[in]       wt          pointer to kernel weights
125    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
126    * @param[in]       dim_kernel  filter kernel size
127    * @param[in]       padding     padding sizes
128    * @param[in]       stride      convolution stride
129    * @param[in]       bias        pointer to bias
130    * @param[in]       bias_shift  amount of left-shift for bias
131    * @param[in]       out_shift   amount of right-shift for output
132    * @param[in,out]   Im_out      pointer to output tensor
133    * @param[in]       dim_im_out  output tensor dimension
134    * @param[in,out]   bufferA     pointer to buffer space for input
135    * @param[in,out]   bufferB     pointer to buffer space for output
136    * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
137    *
138    */
139 
140     riscv_status riscv_convolve_HWC_q7_basic(const q7_t * Im_in,
141                                          const uint16_t dim_im_in,
142                                          const uint16_t ch_im_in,
143                                          const q7_t * wt,
144                                          const uint16_t ch_im_out,
145                                          const uint16_t dim_kernel,
146                                          const uint16_t padding,
147                                          const uint16_t stride,
148                                          const q7_t * bias,
149                                          const uint16_t bias_shift,
150                                          const uint16_t out_shift,
151                                          q7_t * Im_out,
152                                          const uint16_t dim_im_out,
153                                          q15_t * bufferA,
154                                          q7_t * bufferB);
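
  /*
   * Usage sketch (all dimensions and shift amounts are hypothetical, not taken
   * from the library examples).  bufferA is sized with the
   * 2*ch_im_in*dim_kernel*dim_kernel rule documented for the convolution
   * functions in this file; bufferB is passed as NULL on the assumption that
   * this variant does not use it -- check the implementation before relying
   * on that.
   *
   *   #define IM_DIM       32          // 32x32 input
   *   #define IM_CH         3          // 3 input channels
   *   #define KER_DIM       5          // 5x5 kernel
   *   #define OUT_CH       16          // 16 filters
   *   #define PADDING       2
   *   #define STRIDE        1
   *   #define OUT_DIM      32          // (32 + 2*2 - 5)/1 + 1
   *   #define BIAS_LSHIFT   0          // quantization dependent
   *   #define OUT_RSHIFT    9          // quantization dependent
   *
   *   static q7_t  wt[OUT_CH * IM_CH * KER_DIM * KER_DIM];
   *   static q7_t  bias[OUT_CH];
   *   static q7_t  im_in[IM_DIM * IM_DIM * IM_CH];
   *   static q7_t  im_out[OUT_DIM * OUT_DIM * OUT_CH];
   *   static q15_t buffer_a[2 * IM_CH * KER_DIM * KER_DIM];
   *
   *   riscv_status status =
   *       riscv_convolve_HWC_q7_basic(im_in, IM_DIM, IM_CH, wt, OUT_CH, KER_DIM,
   *                                   PADDING, STRIDE, bias, BIAS_LSHIFT,
   *                                   OUT_RSHIFT, im_out, OUT_DIM, buffer_a, NULL);
   */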
155 
156   /**
157    * @brief Basic Q7 convolution function (non-square shape)
158    * @param[in]       Im_in        pointer to input tensor
159    * @param[in]       dim_im_in_x  input tensor dimension x
160    * @param[in]       dim_im_in_y  input tensor dimension y
161    * @param[in]       ch_im_in     number of input tensor channels
162    * @param[in]       wt           pointer to kernel weights
163    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
164    * @param[in]       dim_kernel_x filter kernel size x
165    * @param[in]       dim_kernel_y filter kernel size y
166    * @param[in]       padding_x    padding size x
167    * @param[in]       padding_y    padding size y
168    * @param[in]       stride_x     convolution stride x
169    * @param[in]       stride_y     convolution stride y
170    * @param[in]       bias         pointer to bias
171    * @param[in]       bias_shift   amount of left-shift for bias
172    * @param[in]       out_shift    amount of right-shift for output
173    * @param[in,out]   Im_out       pointer to output tensor
174    * @param[in]       dim_im_out_x output tensor dimension x
175    * @param[in]       dim_im_out_y output tensor dimension y
176    * @param[in,out]   bufferA      pointer to buffer space for input
177    * @param[in,out]   bufferB      pointer to buffer space for output
178    * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
179    */
180 
181     riscv_status riscv_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in,
182                                                   const uint16_t dim_im_in_x,
183                                                   const uint16_t dim_im_in_y,
184                                                   const uint16_t ch_im_in,
185                                                   const q7_t * wt,
186                                                   const uint16_t ch_im_out,
187                                                   const uint16_t dim_kernel_x,
188                                                   const uint16_t dim_kernel_y,
189                                                   const uint16_t padding_x,
190                                                   const uint16_t padding_y,
191                                                   const uint16_t stride_x,
192                                                   const uint16_t stride_y,
193                                                   const q7_t * bias,
194                                                   const uint16_t bias_shift,
195                                                   const uint16_t out_shift,
196                                                   q7_t * Im_out,
197                                                   const uint16_t dim_im_out_x,
198                                                   const uint16_t dim_im_out_y,
199                                                   q15_t * bufferA,
200                                                   q7_t * bufferB);
201 
202   /**
203    * @brief Basic Q15 convolution function
204    * @param[in]       Im_in       pointer to input tensor
205    * @param[in]       dim_im_in   input tensor dimension
206    * @param[in]       ch_im_in    number of input tensor channels
207    * @param[in]       wt          pointer to kernel weights
208    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
209    * @param[in]       dim_kernel  filter kernel size
210    * @param[in]       padding     padding sizes
211    * @param[in]       stride      convolution stride
212    * @param[in]       bias        pointer to bias
213    * @param[in]       bias_shift  amount of left-shift for bias
214    * @param[in]       out_shift   amount of right-shift for output
215    * @param[in,out]   Im_out      pointer to output tensor
216    * @param[in]       dim_im_out  output tensor dimension
217    * @param[in,out]   bufferA     pointer to buffer space for input
218    * @param[in,out]   bufferB     pointer to buffer space for output
219    * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
220    *
221    */
222 
223     riscv_status riscv_convolve_HWC_q15_basic(const q15_t * Im_in,
224                                           const uint16_t dim_im_in,
225                                           const uint16_t ch_im_in,
226                                           const q15_t * wt,
227                                           const uint16_t ch_im_out,
228                                           const uint16_t dim_kernel,
229                                           const uint16_t padding,
230                                           const uint16_t stride,
231                                           const q15_t * bias,
232                                           const uint16_t bias_shift,
233                                           const uint16_t out_shift,
234                                           q15_t * Im_out,
235                                           const uint16_t dim_im_out,
236                                           q15_t * bufferA,
237                                           q7_t * bufferB);
238 
239   /**
240    * @brief Fast Q7 convolution function
241    * @param[in]       Im_in       pointer to input tensor
242    * @param[in]       dim_im_in   input tensor dimension
243    * @param[in]       ch_im_in    number of input tensor channels
244    * @param[in]       wt          pointer to kernel weights
245    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
246    * @param[in]       dim_kernel  filter kernel size
247    * @param[in]       padding     padding sizes
248    * @param[in]       stride      convolution stride
249    * @param[in]       bias        pointer to bias
250    * @param[in]       bias_shift  amount of left-shift for bias
251    * @param[in]       out_shift   amount of right-shift for output
252    * @param[in,out]   Im_out      pointer to output tensor
253    * @param[in]       dim_im_out  output tensor dimension
254    * @param[in,out]   bufferA     pointer to buffer space for input
255    * @param[in,out]   bufferB     pointer to buffer space for output
256    * @return     The function returns either
257    * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
258    *
259    * This function is the version with the full list of optimization tricks, but with
260    * some constraints:
261    *   ch_im_in is a multiple of 4
262    *   ch_im_out is a multiple of 2
263    */
264 
265     riscv_status riscv_convolve_HWC_q7_fast(const q7_t * Im_in,
266                                         const uint16_t dim_im_in,
267                                         const uint16_t ch_im_in,
268                                         const q7_t * wt,
269                                         const uint16_t ch_im_out,
270                                         const uint16_t dim_kernel,
271                                         const uint16_t padding,
272                                         const uint16_t stride,
273                                         const q7_t * bias,
274                                         const uint16_t bias_shift,
275                                         const uint16_t out_shift,
276                                         q7_t * Im_out,
277                                         const uint16_t dim_im_out,
278                                         q15_t * bufferA,
279                                         q7_t * bufferB);
280 
281   /**
282    * @brief Fast Q7 convolution function (non-square shape)
283    * @param[in]       Im_in        pointer to input tensor
284    * @param[in]       dim_im_in_x  input tensor dimension x
285    * @param[in]       dim_im_in_y  input tensor dimension y
286    * @param[in]       ch_im_in     number of input tensor channels
287    * @param[in]       wt           pointer to kernel weights
288    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
289    * @param[in]       dim_kernel_x filter kernel size x
290    * @param[in]       dim_kernel_y filter kernel size y
291    * @param[in]       padding_x    padding size x
292    * @param[in]       padding_y    padding size y
293    * @param[in]       stride_x     convolution stride x
294    * @param[in]       stride_y     convolution stride y
295    * @param[in]       bias         pointer to bias
296    * @param[in]       bias_shift   amount of left-shift for bias
297    * @param[in]       out_shift    amount of right-shift for output
298    * @param[in,out]   Im_out       pointer to output tensor
299    * @param[in]       dim_im_out_x output tensor dimension x
300    * @param[in]       dim_im_out_y output tensor dimension y
301    * @param[in,out]   bufferA      pointer to buffer space for input
302    * @param[in,out]   bufferB      pointer to buffer space for output
303    * @return     The function returns either
304    * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
305    *
306    * This function is the version with the full list of optimization tricks, but with
307    * some constraints:
308    *   ch_im_in is a multiple of 4
309    *   ch_im_out is a multiple of 2
310    */
311 
312     riscv_status riscv_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in,
313                                                   const uint16_t dim_im_in_x,
314                                                   const uint16_t dim_im_in_y,
315                                                   const uint16_t ch_im_in,
316                                                   const q7_t * wt,
317                                                   const uint16_t ch_im_out,
318                                                   const uint16_t dim_kernel_x,
319                                                   const uint16_t dim_kernel_y,
320                                                   const uint16_t padding_x,
321                                                   const uint16_t padding_y,
322                                                   const uint16_t stride_x,
323                                                   const uint16_t stride_y,
324                                                   const q7_t * bias,
325                                                   const uint16_t bias_shift,
326                                                   const uint16_t out_shift,
327                                                   q7_t * Im_out,
328                                                   const uint16_t dim_im_out_x,
329                                                   const uint16_t dim_im_out_y,
330                                                   q15_t * bufferA,
331                                                   q7_t * bufferB);
332 
333   /**
334    * @brief Fast Q7 version of 1x1 convolution (non-square shape)
335    * @param[in]       Im_in        pointer to input tensor
336    * @param[in]       dim_im_in_x  input tensor dimension x
337    * @param[in]       dim_im_in_y  input tensor dimension y
338    * @param[in]       ch_im_in     number of input tensor channels
339    * @param[in]       wt           pointer to kernel weights
340    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
341    * @param[in]       dim_kernel_x filter kernel size x
342    * @param[in]       dim_kernel_y filter kernel size y
343    * @param[in]       padding_x    padding size x
344    * @param[in]       padding_y    padding size y
345    * @param[in]       stride_x     convolution stride x
346    * @param[in]       stride_y     convolution stride y
347    * @param[in]       bias         pointer to bias
348    * @param[in]       bias_shift   amount of left-shift for bias
349    * @param[in]       out_shift    amount of right-shift for output
350    * @param[in,out]   Im_out       pointer to output tensor
351    * @param[in]       dim_im_out_x output tensor dimension x
352    * @param[in]       dim_im_out_y output tensor dimension y
353    * @param[in,out]   bufferA      pointer to buffer space for input
354    * @param[in,out]   bufferB      pointer to buffer space for output
355    * @return     The function returns either
356    * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
357    *
358    * This function implements convolution with a 1x1 kernel size (i.e., dim_kernel_x=1
359    * and dim_kernel_y=1). It can be used for the
360    * second half of MobileNets, after the depthwise separable convolution.
361    *
362    * This function is the version with the full list of optimization tricks, but with
363    * some constraints:
364    *   ch_im_in is a multiple of 4
365    *   ch_im_out is a multiple of 2
366    */
367     riscv_status riscv_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in,
368                                                       const uint16_t dim_im_in_x,
369                                                       const uint16_t dim_im_in_y,
370                                                       const uint16_t ch_im_in,
371                                                       const q7_t * wt,
372                                                       const uint16_t ch_im_out,
373                                                       const uint16_t dim_kernel_x,
374                                                       const uint16_t dim_kernel_y,
375                                                       const uint16_t padding_x,
376                                                       const uint16_t padding_y,
377                                                       const uint16_t stride_x,
378                                                       const uint16_t stride_y,
379                                                       const q7_t * bias,
380                                                       const uint16_t bias_shift,
381                                                       const uint16_t out_shift,
382                                                       q7_t * Im_out,
383                                                       const uint16_t dim_im_out_x,
384                                                       const uint16_t dim_im_out_y,
385                                                       q15_t * bufferA,
386                                                       q7_t * bufferB);
387 
388   /**
389    * @brief Q7 version of convolution for RGB image
390    * @param[in]       Im_in       pointer to input tensor
391    * @param[in]       dim_im_in   input tensor dimension
392    * @param[in]       ch_im_in    number of input tensor channels
393    * @param[in]       wt          pointer to kernel weights
394    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
395    * @param[in]       dim_kernel  filter kernel size
396    * @param[in]       padding     padding sizes
397    * @param[in]       stride      convolution stride
398    * @param[in]       bias        pointer to bias
399    * @param[in]       bias_shift  amount of left-shift for bias
400    * @param[in]       out_shift   amount of right-shift for output
401    * @param[in,out]   Im_out      pointer to output tensor
402    * @param[in]       dim_im_out  output tensor dimension
403    * @param[in,out]   bufferA     pointer to buffer space for input
404    * @param[in,out]   bufferB     pointer to buffer space for output
405    * @return     The function returns either
406    * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
407    *
408    * This kernel is written exclusively for convolution with ch_im_in
409    * equal to 3. This applies to the first layer of CNNs, which has an
410    * RGB-format input image.
411    */
412 
413     riscv_status riscv_convolve_HWC_q7_RGB(const q7_t * Im_in,
414                                        const uint16_t dim_im_in,
415                                        const uint16_t ch_im_in,
416                                        const q7_t * wt,
417                                        const uint16_t ch_im_out,
418                                        const uint16_t dim_kernel,
419                                        const uint16_t padding,
420                                        const uint16_t stride,
421                                        const q7_t * bias,
422                                        const uint16_t bias_shift,
423                                        const uint16_t out_shift,
424                                        q7_t * Im_out,
425                                        const uint16_t dim_im_out,
426                                        q15_t * bufferA,
427                                        q7_t * bufferB);
428 
429   /**
430    * @brief Fast Q15 convolution function
431    * @param[in]       Im_in       pointer to input tensor
432    * @param[in]       dim_im_in   input tensor dimension
433    * @param[in]       ch_im_in    number of input tensor channels
434    * @param[in]       wt          pointer to kernel weights
435    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
436    * @param[in]       dim_kernel  filter kernel size
437    * @param[in]       padding     padding sizes
438    * @param[in]       stride      convolution stride
439    * @param[in]       bias        pointer to bias
440    * @param[in]       bias_shift  amount of left-shift for bias
441    * @param[in]       out_shift   amount of right-shift for output
442    * @param[in,out]   Im_out      pointer to output tensor
443    * @param[in]       dim_im_out  output tensor dimension
444    * @param[in,out]   bufferA     pointer to buffer space for input
445    * @param[in,out]   bufferB     pointer to buffer space for output
446    * @return     The function returns either
447    * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
448    *
449    * This function is the version with the full list of optimization tricks, but with
450    * some constraints:
451    *   ch_im_in is a multiple of 2
452    *   ch_im_out is a multiple of 2
453    */
454 
455     riscv_status riscv_convolve_HWC_q15_fast(const q15_t * Im_in,
456                                          const uint16_t dim_im_in,
457                                          const uint16_t ch_im_in,
458                                          const q15_t * wt,
459                                          const uint16_t ch_im_out,
460                                          const uint16_t dim_kernel,
461                                          const uint16_t padding,
462                                          const uint16_t stride,
463                                          const q15_t * bias,
464                                          const uint16_t bias_shift,
465                                          const uint16_t out_shift,
466                                          q15_t * Im_out,
467                                          const uint16_t dim_im_out,
468                                          q15_t * bufferA,
469                                          q7_t * bufferB);
470 
471   /**
472    * @brief Fast Q15 convolution function (non-square shape)
473    * @param[in]       Im_in        pointer to input tensor
474    * @param[in]       dim_im_in_x  input tensor dimension x
475    * @param[in]       dim_im_in_y  input tensor dimension y
476    * @param[in]       ch_im_in     number of input tensor channels
477    * @param[in]       wt           pointer to kernel weights
478    * @param[in]       ch_im_out    number of filters, i.e., output tensor channels
479    * @param[in]       dim_kernel_x filter kernel size x
480    * @param[in]       dim_kernel_y filter kernel size y
481    * @param[in]       padding_x    padding size x
482    * @param[in]       padding_y    padding size y
483    * @param[in]       stride_x     convolution stride x
484    * @param[in]       stride_y     convolution stride y
485    * @param[in]       bias         pointer to bias
486    * @param[in]       bias_shift   amount of left-shift for bias
487    * @param[in]       out_shift    amount of right-shift for output
488    * @param[in,out]   Im_out       pointer to output tensor
489    * @param[in]       dim_im_out_x output tensor dimension x
490    * @param[in]       dim_im_out_y output tensor dimension y
491    * @param[in,out]   bufferA      pointer to buffer space for input
492    * @param[in,out]   bufferB      pointer to buffer space for output
493    * @return     The function returns either
494    * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
495    *
496    * @details
497    *
498    * <b>Buffer size:</b>
499    *
500    * bufferA size: 2*ch_im_in*dim_kernel_x*dim_kernel_y
501    *
502    * bufferB size: 0
503    *
504    * <b>Input dimension constraints:</b>
505    *
506    * ch_im_in is a multiple of 2
507    *
508    * ch_im_out is a multiple of 2
509    *
510    */
511 
512     riscv_status
513     riscv_convolve_HWC_q15_fast_nonsquare(const q15_t * Im_in,
514                               const uint16_t dim_im_in_x,
515                               const uint16_t dim_im_in_y,
516                               const uint16_t ch_im_in,
517                               const q15_t * wt,
518                               const uint16_t ch_im_out,
519                               const uint16_t dim_kernel_x,
520                               const uint16_t dim_kernel_y,
521                               const uint16_t padding_x,
522                               const uint16_t padding_y,
523                               const uint16_t stride_x,
524                               const uint16_t stride_y,
525                               const q15_t * bias,
526                               const uint16_t bias_shift,
527                               const uint16_t out_shift,
528                               q15_t * Im_out,
529                               const uint16_t dim_im_out_x,
530                               const uint16_t dim_im_out_y,
531                               q15_t * bufferA,
532                               q7_t * bufferB);
533 
534   /**
535    * @brief Q7 depthwise separable convolution function
536    * @param[in]       Im_in       pointer to input tensor
537    * @param[in]       dim_im_in   input tensor dimension
538    * @param[in]       ch_im_in    number of input tensor channels
539    * @param[in]       wt          pointer to kernel weights
540    * @param[in]       ch_im_out   number of filters, i.e., output tensor channels
541    * @param[in]       dim_kernel  filter kernel size
542    * @param[in]       padding     padding sizes
543    * @param[in]       stride      convolution stride
544    * @param[in]       bias        pointer to bias
545    * @param[in]       bias_shift  amount of left-shift for bias
546    * @param[in]       out_shift   amount of right-shift for output
547    * @param[in,out]   Im_out      pointer to output tensor
548    * @param[in]       dim_im_out  output tensor dimension
549    * @param[in,out]   bufferA     pointer to buffer space for input
550    * @param[in,out]   bufferB     pointer to buffer space for output
551    * @return     The function returns either
552    * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
553    *
554    * This function is the version with the full list of optimization tricks, but with
555    * some constraints:
556    *   ch_im_in is a multiple of 2
557    *   ch_im_out is a multiple of 2
558    */
559 
560     riscv_status riscv_depthwise_separable_conv_HWC_q7(const q7_t * Im_in,
561                                                    const uint16_t dim_im_in,
562                                                    const uint16_t ch_im_in,
563                                                    const q7_t * wt,
564                                                    const uint16_t ch_im_out,
565                                                    const uint16_t dim_kernel,
566                                                    const uint16_t padding,
567                                                    const uint16_t stride,
568                                                    const q7_t * bias,
569                                                    const uint16_t bias_shift,
570                                                    const uint16_t out_shift,
571                                                    q7_t * Im_out,
572                                                    const uint16_t dim_im_out,
573                                                    q15_t * bufferA,
574                                                    q7_t * bufferB);
575 
576   /**
577    * @brief Q7 depthwise separable convolution function (non-square shape)
578    * @param[in]       Im_in         pointer to input tensor
579    * @param[in]       dim_im_in_x   input tensor dimension x
580    * @param[in]       dim_im_in_y   input tensor dimension y
581    * @param[in]       ch_im_in      number of input tensor channels
582    * @param[in]       wt            pointer to kernel weights
583    * @param[in]       ch_im_out     number of filters, i.e., output tensor channels
584    * @param[in]       dim_kernel_x  filter kernel size x
585    * @param[in]       dim_kernel_y  filter kernel size y
586    * @param[in]       padding_x     padding sizes x
587    * @param[in]       padding_y     padding sizes y
588    * @param[in]       stride_x      convolution stride x
589    * @param[in]       stride_y      convolution stride y
590    * @param[in]       bias          pointer to bias
591    * @param[in]       bias_shift    amount of left-shift for bias
592    * @param[in]       out_shift     amount of right-shift for output
593    * @param[in,out]   Im_out        pointer to output tensor
594    * @param[in]       dim_im_out_x  output tensor dimension x
595    * @param[in]       dim_im_out_y  output tensor dimension y
596    * @param[in,out]   bufferA       pointer to buffer space for input
597    * @param[in,out]   bufferB       pointer to buffer space for output
598    * @return     The function returns either
599    * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking.
600    *
601    * This function is the version with the full list of optimization tricks, but with
602    * some constraints:
603    *   ch_im_in is a multiple of 2
604    *   ch_im_out is a multiple of 2
605    */
606     riscv_status riscv_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in,
607                                                              const uint16_t dim_im_in_x,
608                                                              const uint16_t dim_im_in_y,
609                                                              const uint16_t ch_im_in,
610                                                              const q7_t * wt,
611                                                              const uint16_t ch_im_out,
612                                                              const uint16_t dim_kernel_x,
613                                                              const uint16_t dim_kernel_y,
614                                                              const uint16_t padding_x,
615                                                              const uint16_t padding_y,
616                                                              const uint16_t stride_x,
617                                                              const uint16_t stride_y,
618                                                              const q7_t * bias,
619                                                              const uint16_t bias_shift,
620                                                              const uint16_t out_shift,
621                                                              q7_t * Im_out,
622                                                              const uint16_t dim_im_out_x,
623                                                              const uint16_t dim_im_out_y,
624                                                              q15_t * bufferA,
625                                                              q7_t * bufferB);
626 
627 
628 /**
629  * @defgroup FC Fully-connected Layer Functions
630  *
631  * Perform fully-connected layer
632  *
633  * Fully-connected layer is basically a matrix-vector multiplication
634  * with bias. The matrix is the weights and the input/output vectors
635  * are the activation values. Supported {weight, activation} precisions
636  * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}.
637  *
638  * Here we have two types of kernel functions. The basic functions
639  * implement the layer using a regular GEMV approach. The opt functions
640  * operate with weights in an interleaved format.
641  *
642  */
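
/*
 * Illustrative sketch only, not the library implementation: the fixed-point
 * matrix-vector product computed by the basic fully-connected functions
 * (shown here for the Q7 case).  Whether the right-shift floors or rounds to
 * nearest depends on RISCV_NN_TRUNCATE; this sketch simply floors.
 *
 *   for (int row = 0; row < num_of_rows; row++) {
 *     int32_t acc = (int32_t)bias[row] << bias_shift;        // left-shifted bias
 *     for (int col = 0; col < dim_vec; col++) {
 *       acc += (int32_t)pM[row * dim_vec + col] * pV[col];   // weight * activation
 *     }
 *     acc >>= out_shift;                                     // right-shift for output
 *     if (acc > 127)  acc = 127;                             // saturate to the q7 range
 *     if (acc < -128) acc = -128;
 *     pOut[row] = (q7_t)acc;
 *   }
 */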
643 
644   /**
645    * @brief Q7 basic fully-connected layer function
646    * @param[in]       pV          pointer to input vector
647    * @param[in]       pM          pointer to matrix weights
648    * @param[in]       dim_vec     length of the vector
649    * @param[in]       num_of_rows number of rows in weight matrix
650    * @param[in]       bias_shift  amount of left-shift for bias
651    * @param[in]       out_shift   amount of right-shift for output
652    * @param[in]       bias        pointer to bias
653    * @param[in,out]   pOut        pointer to output vector
654    * @param[in,out]   vec_buffer  pointer to buffer space for input
655    * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
656    *
657    */
658 
659     riscv_status riscv_fully_connected_q7(const q7_t * pV,
660                                       const q7_t * pM,
661                                       const uint16_t dim_vec,
662                                       const uint16_t num_of_rows,
663                                       const uint16_t bias_shift,
664                                       const uint16_t out_shift,
665                                       const q7_t * bias,
666                                       q7_t * pOut,
667                                       q15_t * vec_buffer);
668 
669   /**
670    * @brief S8 basic fully-connected layer function for TF Lite
671    * @param[in]       pInput                       pointer to input vector
672    * @param[in]       weight                       pointer to matrix weights
673    * @param[in]       input_length                 length of the input vector
674    * @param[in]       num_rows                     number of rows in the weight matrix (length of the output vector)
675    * @param[in]       nb_batches                   number of batches
676    * @param[in]       input_offset                 input tensor zero-point offset
677    * @param[in]       filter_offset                weight tensor zero-point offset
678    * @param[in]       out_mult                     requantization parameter: output multiplier
679    * @param[in]       out_shift                    requantization parameter: amount of right-shift for output
680    * @param[in]       output_offset                output tensor zero-point offset
681    * @param[in]       bias                         pointer to bias
682    * @param[out]      pOut                         pointer to output vector
683    * @param[in]       output_activation_min        minimum value to clamp the output to
684    * @param[in]       output_activation_max        maximum value to clamp the output to
685    * @param[in,out]   vec_buffer                   pointer to buffer space for the input
686    * @return          The function returns <code>RISCV_MATH_SUCCESS</code>
687    *
688    * @details
689    *
690    * <b>Buffer size:</b>
691    *
692    * vec_buffer size: input_length 16-bit (q15_t) words.
693    *
694    * This basic function is designed to work with a regular weight
695    * matrix without interleaving.
696    *
697    */
698   riscv_status
699   riscv_fully_connected_s8(const int8_t   *pInput,
700                          const int8_t   *weight,
701                          const uint16_t input_length,
702                          const uint16_t num_rows,
703                          const uint16_t nb_batches,
704                          const int32_t  input_offset,
705                          const int32_t  filter_offset,
706                          const int32_t  out_mult,
707                          const int32_t  out_shift,
708                          const int32_t  output_offset,
709                          const int8_t   *bias,
710                          int8_t         *pOut,
711                          const int32_t  output_activation_min,
712                          const int32_t  output_activation_max,
713                          q15_t          *vec_buffer);
714 
715   /**
716    * @brief Q7 opt fully-connected layer function
717    * @param[in]       pV          pointer to input vector
718    * @param[in]       pM          pointer to matrix weights
719    * @param[in]       dim_vec     length of the vector
720    * @param[in]       num_of_rows number of rows in weight matrix
721    * @param[in]       bias_shift  amount of left-shift for bias
722    * @param[in]       out_shift   amount of right-shift for output
723    * @param[in]       bias        pointer to bias
724    * @param[in,out]   pOut        pointer to output vector
725    * @param[in,out]   vec_buffer  pointer to buffer space for input
726    * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
727    *
728    */
729 
730     riscv_status riscv_fully_connected_q7_opt(const q7_t * pV,
731                                           const q7_t * pM,
732                                           const uint16_t dim_vec,
733                                           const uint16_t num_of_rows,
734                                           const uint16_t bias_shift,
735                                           const uint16_t out_shift,
736                                           const q7_t * bias,
737                                           q7_t * pOut,
738                                           q15_t * vec_buffer);
739 
740   /**
741    * @brief Q15 basic fully-connected layer function
742    * @param[in]       pV          pointer to input vector
743    * @param[in]       pM          pointer to matrix weights
744    * @param[in]       dim_vec     length of the vector
745    * @param[in]       num_of_rows number of rows in weight matrix
746    * @param[in]       bias_shift  amount of left-shift for bias
747    * @param[in]       out_shift   amount of right-shift for output
748    * @param[in]       bias        pointer to bias
749    * @param[in,out]   pOut        pointer to output vector
750    * @param[in,out]   vec_buffer  pointer to buffer space for input
751    * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
752    *
753    */
754 
755     riscv_status riscv_fully_connected_q15(const q15_t * pV,
756                                        const q15_t * pM,
757                                        const uint16_t dim_vec,
758                                        const uint16_t num_of_rows,
759                                        const uint16_t bias_shift,
760                                        const uint16_t out_shift,
761                                        const q15_t * bias,
762                                        q15_t * pOut,
763                                        q15_t * vec_buffer);
764 
765   /**
766    * @brief Q15 opt fully-connected layer function
767    * @param[in]       pV          pointer to input vector
768    * @param[in]       pM          pointer to matrix weights
769    * @param[in]       dim_vec     length of the vector
770    * @param[in]       num_of_rows number of rows in weight matrix
771    * @param[in]       bias_shift  amount of left-shift for bias
772    * @param[in]       out_shift   amount of right-shift for output
773    * @param[in]       bias        pointer to bias
774    * @param[in,out]   pOut        pointer to output vector
775    * @param[in,out]   vec_buffer  pointer to buffer space for input
776    * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
777    *
778    */
779 
780     riscv_status riscv_fully_connected_q15_opt(const q15_t * pV,
781                                            const q15_t * pM,
782                                            const uint16_t dim_vec,
783                                            const uint16_t num_of_rows,
784                                            const uint16_t bias_shift,
785                                            const uint16_t out_shift,
786                                            const q15_t * bias,
787                                            q15_t * pOut,
788                                            q15_t * vec_buffer);
789 
790   /**
791    * @brief Mixed Q15-Q7 fully-connected layer function
792    * @param[in]       pV          pointer to input vector
793    * @param[in]       pM          pointer to matrix weights
794    * @param[in]       dim_vec     length of the vector
795    * @param[in]       num_of_rows number of rows in weight matrix
796    * @param[in]       bias_shift  amount of left-shift for bias
797    * @param[in]       out_shift   amount of right-shift for output
798    * @param[in]       bias        pointer to bias
799    * @param[in,out]   pOut        pointer to output vector
800    * @param[in,out]   vec_buffer  pointer to buffer space for input
801    * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
802    *
803    */
804 
805     riscv_status riscv_fully_connected_mat_q7_vec_q15(const q15_t * pV,
806                                                   const q7_t * pM,
807                                                   const uint16_t dim_vec,
808                                                   const uint16_t num_of_rows,
809                                                   const uint16_t bias_shift,
810                                                   const uint16_t out_shift,
811                                                   const q7_t * bias,
812                                                   q15_t * pOut,
813                                                   q15_t * vec_buffer);
814 
815   /**
816    * @brief Mixed Q15-Q7 opt fully-connected layer function
817    * @param[in]       pV          pointer to input vector
818    * @param[in]       pM          pointer to matrix weights
819    * @param[in]       dim_vec     length of the vector
820    * @param[in]       num_of_rows number of rows in weight matrix
821    * @param[in]       bias_shift  amount of left-shift for bias
822    * @param[in]       out_shift   amount of right-shift for output
823    * @param[in]       bias        pointer to bias
824    * @param[in,out]   pOut        pointer to output vector
825    * @param[in,out]   vec_buffer  pointer to buffer space for input
826    * @return     The function returns <code>RISCV_MATH_SUCCESS</code>
827    *
828    */
829 
830     riscv_status riscv_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV,
831                                                       const q7_t * pM,
832                                                       const uint16_t dim_vec,
833                                                       const uint16_t num_of_rows,
834                                                       const uint16_t bias_shift,
835                                                       const uint16_t out_shift,
836                                                       const q7_t * bias,
837                                                       q15_t * pOut,
838                                                       q15_t * vec_buffer);
839 
840 /**
841  * @brief Matrix-Multiplication Kernels for Convolution
842  *
843  * These functions are used within convolution layer functions for
844  * matrix multiplication.
845  *
846  * The implementation is similar to NMSIS-DSP riscv_mat_mult functions
847  * with one Q7 and one Q15 operand. The Q15 operand is the im2col
848  * output, which always has 2 columns.
849  *
850  */
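
/*
 * Illustrative sketch only: how a convolution loop typically drives these
 * kernels.  Two im2col columns are assembled in bufferA, then a single kernel
 * call produces the outputs for both patches and returns the advanced output
 * pointer.  The variable names are hypothetical.
 *
 *   q15_t *im2col = bufferA;     // holds 2 columns of numCol_A values each
 *   q7_t  *pOut   = Im_out;
 *   // ... fill im2col with two consecutive input patches ...
 *   pOut = riscv_nn_mat_mult_kernel_q7_q15(wt, im2col, ch_im_out, numCol_A,
 *                                          bias_shift, out_shift, bias, pOut);
 */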
851 
852   /**
853    * @brief Matrix-multiplication function for convolution
854    * @param[in]       pA          pointer to operand A
855    * @param[in]       pInBuffer   pointer to operand B, always consists of 2 vectors
856    * @param[in]       ch_im_out   numRow of A
857    * @param[in]       numCol_A    numCol of A
858    * @param[in]       bias_shift  amount of left-shift for bias
859    * @param[in]       out_shift   amount of right-shift for output
860    * @param[in]       bias        the bias
861    * @param[in,out]   pOut        pointer to output
862    * @return     The function returns the incremented output pointer
863    */
864 
865     q7_t     *riscv_nn_mat_mult_kernel_q7_q15(const q7_t * pA,
866                                             const q15_t * pInBuffer,
867                                             const uint16_t ch_im_out,
868                                             const uint16_t numCol_A,
869                                             const uint16_t bias_shift,
870                                             const uint16_t out_shift,
871                                             const q7_t * bias,
872                                             q7_t * pOut);
873 
874     q7_t     *riscv_nn_mat_mult_kernel_q7(const q7_t * pA,
875                                             const q7_t * pInBuffer,
876                                             const uint16_t ch_im_out,
877                                             const uint16_t numCol_A,
878                                             const uint16_t bias_shift,
879                                             const uint16_t out_shift,
880                                             const q7_t * bias,
881                                             q7_t * pOut);
882 
883   /**
884    * @brief Matrix-multiplication function for convolution with reordered columns
885    * @param[in]       pA          pointer to operand A
886    * @param[in]       pInBuffer   pointer to operand B, always consists of 2 vectors
887    * @param[in]       ch_im_out   numRow of A
888    * @param[in]       numCol_A    numCol of A
889    * @param[in]       bias_shift  amount of left-shift for bias
890    * @param[in]       out_shift   amount of right-shift for output
891    * @param[in]       bias        the bias
892    * @param[in,out]   pOut        pointer to output
893    * @return     The function returns the incremented output pointer
894    */
895 
896     q7_t     *riscv_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA,
897                                                       const q15_t * pInBuffer,
898                                                       const uint16_t ch_im_out,
899                                                       const uint16_t numCol_A,
900                                                       const uint16_t bias_shift,
901                                                       const uint16_t out_shift,
902                                                       const q7_t * bias,
903                                                       q7_t * pOut);
904 
905     q7_t     *riscv_nn_mat_mult_kernel_q7_reordered(const q7_t * pA,
906                                                       const q7_t * pInBuffer,
907                                                       const uint16_t ch_im_out,
908                                                       const uint16_t numCol_A,
909                                                       const uint16_t bias_shift,
910                                                       const uint16_t out_shift,
911                                                       const q7_t * bias,
912                                                       q7_t * pOut);
913 
914 #ifdef __cplusplus
915 }
916 #endif
917 
918 /*
919  *  Other functions
920  *  These layers are typically not timing critical
921  *  Basic implementation is supported here
922  */
923 
924 #ifdef __cplusplus
925 extern    "C"
926 {
927 #endif
928 
929 /**
930  * @defgroup Acti Neural Network Activation Functions
931  *
932  * Perform activation layers, including ReLU (Rectified Linear Unit),
933  * sigmoid and tanh
934  *
935  */
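
/*
 * Usage sketch (illustrative): the activation functions operate in place on
 * the data buffer, so they are typically called directly on a layer's output.
 *
 *   static q7_t act[32 * 32 * 16];          // hypothetical layer output
 *   riscv_relu_q7(act, sizeof(act));        // size = number of q7_t elements
 */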
936 
937   /**
938    * @brief Q7 RELU function
939    * @param[in,out]   data        pointer to input
940    * @param[in]       size        number of elements
941    * @return none.
942    */
943 
944     void      riscv_relu_q7(q7_t * data, uint16_t size);
945 
946   /**
947    * @brief Q15 RELU function
948    * @param[in,out]   data        pointer to input
949    * @param[in]       size        number of elements
950    * @return none.
951    */
952 
953     void      riscv_relu_q15(q15_t * data, uint16_t size);
954 
955   /**
956    * @brief Q7 neural network activation function using direct table look-up
957    * @param[in,out]   data        pointer to input
958    * @param[in]       size        number of elements
959    * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
960    * @param[in]       type        type of activation functions
961    * @return none.
962    */
963 
964     void      riscv_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
965                                            riscv_nn_activation_type type);
966 
967   /**
968    * @brief Q15 neural network activation function using direct table look-up
969    * @param[in,out]   data        pointer to input
970    * @param[in]       size        number of elements
971    * @param[in]       int_width   bit-width of the integer part, assumed to be smaller than 3
972    * @param[in]       type        type of activation functions
973    * @return none.
974    */
975 
976     void      riscv_nn_activations_direct_q15(q15_t * data, uint16_t size, uint16_t int_width,
977                                             riscv_nn_activation_type type);
978 
979 /**
980  * @defgroup Pooling Neural Network Pooling Functions
981  *
982  * Perform pooling functions, including max pooling and average pooling
983  *
984  */
985 
986   /**
987    * @brief Q7 max pooling function
988    * @param[in]       Im_in       pointer to input tensor
989    * @param[in]       dim_im_in   input tensor dimension
990    * @param[in]       ch_im_in    number of input tensor channels
991    * @param[in]       dim_kernel  filter kernel size
992    * @param[in]       padding     padding sizes
993    * @param[in]       stride      pooling stride
994    * @param[in]       dim_im_out  output tensor dimension
995    * @param[in,out]   bufferA     pointer to buffer space for input
996    * @param[in,out]   Im_out      pointer to output tensor
997    * @return none.
998    *
999    */
1000 
1001     void      riscv_maxpool_q7_HWC(q7_t * Im_in,
1002                                  const uint16_t dim_im_in,
1003                                  const uint16_t ch_im_in,
1004                                  const uint16_t dim_kernel,
1005                                  const uint16_t padding,
1006                                  const uint16_t stride,
1007                                  const uint16_t dim_im_out,
1008                                  q7_t * bufferA,
1009                                  q7_t * Im_out);
1010 
1011   /**
1012    * @brief Q7 average pooling function
1013    * @param[in]       Im_in       pointer to input tensor
1014    * @param[in]       dim_im_in   input tensor dimension
1015    * @param[in]       ch_im_in    number of input tensor channels
1016    * @param[in]       dim_kernel  filter kernel size
1017    * @param[in]       padding     padding sizes
1018    * @param[in]       stride      pooling stride
1019    * @param[in]       dim_im_out  output tensor dimension
1020    * @param[in,out]   bufferA     pointer to buffer space for input
1021    * @param[in,out]   Im_out      pointer to output tensor
1022    * @return none.
1023    *
1024    */
1025 
1026     void      riscv_avepool_q7_HWC(q7_t * Im_in,
1027                                  const uint16_t dim_im_in,
1028                                  const uint16_t ch_im_in,
1029                                  const uint16_t dim_kernel,
1030                                  const uint16_t padding,
1031                                  const uint16_t stride,
1032                                  const uint16_t dim_im_out,
1033                                  q7_t * bufferA,
1034                                  q7_t * Im_out);
1035 
1036 /**
1037  * @defgroup Softmax Softmax Functions
1038  *
1039  * EXP(2) based softmax functions
1040  *
1041  */
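
/*
 * Illustrative sketch only (requires <math.h>): the floating-point reference
 * that these fixed-point kernels approximate.  A base-2 exponential is used
 * (hence "EXP(2) based"), with the maximum subtracted first to bound the range.
 *
 *   float sum = 0.0f, x_max = x[0];
 *   for (int i = 1; i < dim_vec; i++) { if (x[i] > x_max) x_max = x[i]; }
 *   for (int i = 0; i < dim_vec; i++) { sum += powf(2.0f, x[i] - x_max); }
 *   for (int i = 0; i < dim_vec; i++) { out[i] = powf(2.0f, x[i] - x_max) / sum; }
 */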
1042 
1043   /**
1044    * @brief Q7 softmax function
1045    * @param[in]       vec_in      pointer to input vector
1046    * @param[in]       dim_vec     input vector dimension
1047    * @param[out]      p_out       pointer to output vector
1048    * @return none.
1049    *
1050    */
1051 
1052     void      riscv_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out);
1053 
1054   /**
1055    * @brief Q15 softmax function
1056    * @param[in]       vec_in      pointer to input vector
1057    * @param[in]       dim_vec     input vector dimension
1058    * @param[out]      p_out       pointer to output vector
1059    * @return none.
1060    *
1061    */
1062 
1063     void      riscv_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);
1064 
1065   /**
1066    * @brief uint8 depthwise convolution function with asymmetric quantization, for an even number of channel
1067    *        multipliers and input channels. Unless specified otherwise, arguments are mandatory.
1068    *
1069    * @param[in]     input     Pointer to input tensor
1070    * @param[in]     input_x   Width of input tensor
1071    * @param[in]     input_y   Height of input tensor
1072    * @param[in]     input_ch  Channels in input tensor
1073    * @param[in]     kernel    Pointer to kernel weights
1074    * @param[in]     kernel_x  Width of kernel
1075    * @param[in]     kernel_y  Height of kernel
1076    * @param[in]     ch_mult   Channel multiplier (number of output channels per input channel)
1077    * @param[in]     pad_x     Padding sizes x
1078    * @param[in]     pad_y     Padding sizes y
1079    * @param[in]     stride_x  Convolution stride along the width
1080    * @param[in]     stride_y  Convolution stride along the height
1081    * @param[in]     dilation_x Dilation along width. Not used and intended for future enhancement.
1082    * @param[in]     dilation_y Dilation along height. Not used and intended for future enhancement.
1083    * @param[in]     bias       Pointer to optional bias values. If no bias is
1084    *                           available, NULL is expected
1085    * @param[in]     input_offset  Input tensor zero offset
1086    * @param[in]     filter_offset Kernel tensor zero offset
1087    * @param[in]     output_offset Output tensor zero offset
1088    * @param[in,out] output        Pointer to output tensor
1089    * @param[in]     output_x  Width of output tensor
1090    * @param[in]     output_y  Height of output tensor
1091    * @param[in]     output_activation_min   Minimum value to clamp the output to. Range : {0, 255}
1092    * @param[in]     output_activation_max   Maximum value to clamp the output to. Range : {0, 255}
1093    * @param[in]     out_shift  Amount of right-shift for output
1094    * @param[in]     out_mult   Output multiplier for requantization
1095    * @return        The function returns one of the following
1096    *                <code>RISCV_MATH_SIZE_MISMATCH</code> - Unsupported tensor dimensions
1097    *                <code>RISCV_MATH_SUCCESS</code> - Successful operation
1098    *                <code>RISCV_MATH_ARGUMENT_ERROR</code> - Implementation not available
1099    *
1100    * <b>Input constraints</b>
1101    * ch_mult  is a multiple of 2
1102    * kernel_x is a multiple of 2
1103    *
1104    */
1105     riscv_status riscv_depthwise_conv_u8_basic_ver1(const uint8_t *input,
1106                                                 const uint16_t input_x,
1107                                                 const uint16_t input_y,
1108                                                 const uint16_t input_ch,
1109                                                 const uint8_t *kernel,
1110                                                 const uint16_t kernel_x,
1111                                                 const uint16_t kernel_y,
1112                                                 const int16_t ch_mult,
1113                                                 const int16_t pad_x,
1114                                                 const int16_t pad_y,
1115                                                 const int16_t stride_x,
1116                                                 const int16_t stride_y,
1117                                                 const int16_t dilation_x,
1118                                                 const int16_t dilation_y,
1119                                                 const int32_t *bias,
1120                                                 const int32_t input_offset,
1121                                                 const int32_t filter_offset,
1122                                                 const int32_t output_offset,
1123                                                 uint8_t *output,
1124                                                 const uint16_t output_x,
1125                                                 const uint16_t output_y,
1126                                                 const int32_t output_activation_min,
1127                                                 const int32_t output_activation_max,
1128                                                 const int32_t out_shift,
1129                                                 const int32_t out_mult);
1130 #ifdef __cplusplus
1131 }
1132 #endif
1133 
1134 #endif
1135